Problem Set #06 Solutions
Statistics 506

Stratified Bootstrap

library(nycflights13)
library(parallel)
library(future)
data(flights)
flights <- as.data.frame(flights)
point_est <- aggregate(flights$air_time,
                       by = list(flights$origin),
                       FUN = mean, na.rm = TRUE)
point_est

  Group.1        x
1     EWR 153.3000
2     JFK 178.3490
3     LGA 117.8258

First, let’s find a way to carry out the bootstrap resampling. There are of course many ways this can be done.

# Obtain samples of row numbers within each `dest`
a <- aggregate(row.names(flights),
               by = list(flights$dest),
               FUN = function(x) {
                 sample(x, replace = TRUE)
               })
# Get the resampled data
resamp <- flights[Reduce(c, a$x), ]
# Make sure dimensions are the same
identical(dim(resamp), dim(flights))

[1] TRUE

# Make sure sampling was done within `dest`
identical(sort(table(flights$dest)),
          sort(table(resamp$dest)))

[1] TRUE

Define the function:

f <- function(dat) {
  a <- aggregate(row.names(dat),
                 by = list(dat$dest),
                 FUN = function(x) {
                   sample(x, replace = TRUE)
                 })
  resamp <- dat[Reduce(c, a$x), ]
  results <- aggregate(resamp$air_time,
                       by = list(resamp$origin),
                       FUN = mean, na.rm = TRUE)
  return(results)
}

f(flights)

  Group.1        x
1     EWR 153.4715
2     JFK 178.0668
3     LGA 117.8855

system.time(f(flights))

   user  system elapsed 
  0.425   0.019   0.446

This is fine, but slow. (Also, several students mentioned that their solution was significantly faster than mine!) Let’s try data.table.

library(data.table)
flightsdt <- data.table(flights)
f2 <- function(data) {
  resamp <- data[, .SD[sample(x = .N, size = .N, replace = TRUE)], by = dest]
  return(resamp[, .(mean(air_time, na.rm = TRUE)), by = origin])
  return(results)
}

f2(flightsdt)

   origin       V1
1:    EWR 153.7132
2:    LGA 117.8721
3:    JFK 177.8557

system.time(f2(flightsdt))

   user  system elapsed 
  0.059   0.002   0.062

Much better!

Carry out the simulation

reps <- 1000

system.time({
  res1 <- lapply(seq_len(reps),
                 function(x) f2(flightsdt))
})

   user  system elapsed 
 77.493   3.785  82.118

Repeat with mclapply.

system.time({
  res2 <- mclapply(seq_len(reps),
                   function(x) f2(flightsdt),
                   mc.cores = 8)
})

   user  system elapsed 
114.302   6.314  25.100

Repeat a third time with futures.

plan(multisession)
system.time({
  res3 <- lapply(seq_len(reps),
                 function(x) {
                   future(f2(flightsdt), seed = TRUE)
                 })
  res3 <- lapply(res3, value)
})

   user  system elapsed 
 77.182  14.823 148.079

From here, compute the bootrap SEs. Note that using the bootstap SE to compute the confidence interval is one approach; another valid approach would be to estimate the CI directly with the empirical distribution.

produce_table <- function(res) {
  sds <- rbindlist(res)[, sd(V1), by=origin][, V1]

  results <- data.frame(est = point_est$x,
                        lb = point_est$x - 1.96*sds,
                        ub = point_est$x + 1.96*sds)
  rownames(results) <- point_est$Group.1
  results
}

# lapply
produce_table(res1)

         est       lb       ub
EWR 153.3000 152.8880 153.7120
JFK 178.3490 178.1138 178.5843
LGA 117.8258 117.3700 118.2816

# mclapply
produce_table(res2)

         est       lb       ub
EWR 153.3000 152.8913 153.7088
JFK 178.3490 178.1113 178.5868
LGA 117.8258 117.3817 118.2699

# futures
produce_table(res3)

         est       lb       ub
EWR 153.3000 153.0624 153.5377
JFK 178.3490 177.9334 178.7647
LGA 117.8258 117.3790 118.2726