Statistics 506, Fall 2016

Problem set 2


Due October 18

1) Solve problem 2 from the first problem set using R.

Solution (part a):

library(readxl)

df = read_excel("/nfs/kshedden/506/SchichDataS1_FB.xlsx")

# Convert the year columns from text to numeric (non-numeric entries
# become NA).
for (x in c("BYear", "DYear")) {
    df[[x]] = as.numeric(df[[x]])
}

# Restrict to people born between 1700 and 1900 whose death year is
# known.  The !is.na(df$BYear) term prevents NA values in the index.
ii = !is.na(df$BYear) & (df$BYear >= 1700) & (df$BYear <= 1900)
ii = ii & !is.na(df$DYear)
df1 = df[ii,]

# Birth decade and lifespan for each person.
df1$BDec = 10 * floor(df1$BYear / 10)
df1$Lifespan = df1$DYear - df1$BYear

# Mean lifespan by birth decade and gender.
result = NULL
decs = seq(1700, 1900, by=10)
for (dec in decs) {

    ii = (df1$BDec == dec)
    df2 = df1[ii,]
    ls = NULL
    for (gender in c("Female", "Male")) {
        jj = df2$Gender %in% gender
        x = mean(df2$Lifespan[jj])
        ls = c(ls, x)
    }
    result = rbind(result, ls)
}

rownames(result) = decs
colnames(result) = c("Female", "Male")
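
As an aside, the same table can be computed without explicit loops; a minimal equivalent cross-check using tapply (not required by the problem):

# Rows are birth decades, columns are gender categories.
tab = tapply(df1$Lifespan, list(df1$BDec, df1$Gender), mean)
print(tab)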

Solution (part b):

# The haversine function, used below to compute great-circle distances.
hav = function(x) { (1 - cos(x)) / 2 }

# Convert latitudes and longitudes from degrees to radians.
phi1 = 2 * pi * df1$BLocLat / 360
phi2 = 2 * pi * df1$DLocLat / 360
lam1 = 2 * pi * df1$BLocLon / 360
lam2 = 2 * pi * df1$DLocLon / 360

# Great-circle distance (in km) between the birth and death locations,
# using the haversine formula with an earth radius of 6371 km.
d = sqrt(hav(phi2 - phi1) + cos(phi1) * cos(phi2) * hav(lam2 - lam1))
d = asin(d)
df1$bd_dist = 2 * 6371 * d

# Mean birth-to-death distance by birth decade and gender.
result = NULL
for (dec in decs) {

    ii = (df1$BDec == dec)
    df2 = df1[ii,]
    ls = NULL
    for (gender in c("Female", "Male")) {
        jj = df2$Gender %in% gender
        # na.rm guards against missing location coordinates.
        x = mean(df2$bd_dist[jj], na.rm=TRUE)
        ls = c(ls, x)
    }
    result = rbind(result, ls)
}

rownames(result) = decs
colnames(result) = c("Female", "Male")
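
As an optional sanity check (assuming the geosphere package is available), the distances can be compared against geosphere::distHaversine, which takes longitude/latitude pairs in degrees and returns distances in the units of the radius r:

library(geosphere)
d0 = distHaversine(cbind(df1$BLocLon, df1$BLocLat),
                   cbind(df1$DLocLon, df1$DLocLat), r=6371000)
stopifnot(max(abs(d0/1000 - df1$bd_dist), na.rm=TRUE) < 1e-3)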

Solution (part c):

The approach is to map each birth location to a unit vector in Cartesian coordinates, average these vectors within each birth decade, renormalize the average to unit length, and convert the result back to latitude and longitude.

df1$x = cos(lam1) * sin(pi/2 - phi1)
df1$y = sin(lam1) * sin(pi/2 - phi1)
df1$z = cos(pi/2 - phi1)

# Normalized mean birth-location vector for each birth decade.
result = NULL
for (dec in decs) {

    ii = (df1$BDec == dec)
    df2 = df1[ii,]
    df2 = df2[c("x", "y", "z")]

    # na.rm guards against missing location coordinates.
    psn = colMeans(df2, na.rm=TRUE)
    psn = psn / sqrt(sum(psn^2))
    result = rbind(result, c(dec, psn))
}

result = cbind(result, pi/2 - acos(result[,"z"]))
# Use atan2 (rather than atan) so the longitude falls in the correct
# quadrant.
result = cbind(result, atan2(result[,"y"], result[,"x"]))
colnames(result)[c(1, 5, 6)] = c("BDec", "Lat", "Lon")

result[,"Lat"] = 360 * result[,"Lat"] / (2*pi)
result[,"Lon"] = 360 * result[,"Lon"] / (2*pi)

2) Suppose we wish to construct confidence intervals for the difference between the means of two populations based on independent samples of size m and n from the populations. Suppose we treat the two populations as having the same variance, and we construct the usual 95% confidence interval for the mean difference using the pooled variance. Use simulation to estimate the actual coverage probabilities of these intervals when there is heteroscedasticity, i.e. the variances differ by a factor k > 1. Choose several different population settings and report the results. When estimating the coverage probability for a single setting, do not use any loops.

Solution:

The code is below. I did not ask for any interpretation of the results, but if I had, it would have been important to note that in these settings the pooled variance procedure is conservative (the actual coverage is greater than the nominal coverage), and that the degree of conservatism is smaller when the sample sizes are similar than when they differ. Note that below I only consider the case where the larger sample comes from the population with the greater variance; when the smaller sample has the greater variance, the procedure can instead be anti-conservative.
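
For reference, the interval being simulated is the standard pooled two-sample interval

    (xbar - ybar) +/- t(0.975, m+n-2) * sqrt(s2p * (1/m + 1/n)),

where xbar and ybar are the sample means and s2p is the pooled variance

    s2p = (sum_i (x_i - xbar)^2 + sum_j (y_j - ybar)^2) / (m + n - 2).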

# Number of simulation replications
mcrep = 10000

# Simulate two-sample Gaussian data with a defined level of
# heteroscedasticity: the standard deviation of the second sample is k
# times that of the first.  The returned value contains many data sets
# stored by row.
gendat = function(m, n, k) {

    x = rnorm(m*mcrep)
    x = array(x, c(mcrep, m))

    y = rnorm(n*mcrep)
    y = k * array(y, c(mcrep, n))

    return(list(x=x, y=y))
}

# Use vectorization to calculate the pooled variance for every row of
# x/y.
pooled_var = function(x, y) {
    # Derive m and n from the data rather than relying on global
    # variables.
    m = ncol(x)
    n = ncol(y)
    xc = x - rowMeans(x)
    yc = y - rowMeans(y)
    pvar = rowSums(xc*xc) + rowSums(yc*yc)
    pvar = pvar / (m + n - 2)
    return(pvar)
}

# Construct an array of results
result = NULL
for (m in c(10, 20)) {
    for (n in c(20, 30, 40, 50)) {

        # These constants are used in constructing the CIs
        f = qt(0.975, m + n - 2)
        q = f * sqrt(1/m + 1/n)

        for (k in c(1, 2, 5, 10)) {

            # Simulate data
            v = gendat(m, n, k)
            x = v$x
            y = v$y
            pv = pooled_var(x, y)

            # Construct the confidence intervals
            md = rowMeans(x) - rowMeans(y)
            lcb = md - q*sqrt(pv)
            ucb = md + q*sqrt(pv)

            # Determine the coverage rate (the true mean difference is 0)
            coverage = mean((lcb < 0) & (ucb > 0))
            row = c(m, n, k, coverage)
            result = rbind(result, row)
        }
    }
}

row.names(result) = NULL
colnames(result) = c("m", "n", "k", "coverage")
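
As an optional spot check (using x, y, lcb, and ucb from the final loop iteration), the vectorized interval for a single row should match R's t.test with var.equal=TRUE:

tt = t.test(x[1,], y[1,], var.equal=TRUE)
print(tt$conf.int)
print(c(lcb[1], ucb[1]))  # should agree with the interval above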

3) In certain situations, the variance of the sample correlation coefficient is approximately 1/n, where n is the sample size. The approximation holds best when the true correlation is close to zero. Consider at least five different distributions, use simulation to produce a large collection of correlation coefficients for data drawn from these distributions, and use these draws to estimate the variance of the correlation coefficients. You can restrict your consideration to settings where the true correlation coefficient is zero.

Solution:

The code is below. I used t-distributions with various degrees of freedom to generate the data. I did not ask for any discussion, but if I had, the key finding would have been that, at least in the situations considered below, the actual variance of the sample correlation coefficient agrees closely with the 1/n approximation. To the extent that the theory-based and actual results deviate, the actual variances are slightly higher when the degrees of freedom and/or the sample size are small.

# Sample data from five t-distributions with these degrees of freedom.
tdf = c(1, 2, 5, 10, 20)

# The sample sizes (the number of paired values used to estimate each
# correlation coefficient) considered in the loop below.
nvals = c(10, 20, 30)

# Number of simulation replications
mcrep = 10000

# Use vectorization to standardize the rows of x.
vstandardize = function(x) {
    x = x - rowMeans(x)
    x = x / apply(x, 1, sd)
    return(x)
}

# Use vectorization to calculate the correlation coefficients between
# corresponding rows of x and y.
vcor = function(x, y) {
    x = vstandardize(x)
    y = vstandardize(y)
    # vstandardize uses an n-1 divisor for the sd, so divide by n-1 here
    # as well to match the usual sample correlation coefficient.
    r = rowSums(x * y) / (ncol(x) - 1)
    return(r)
}

for (tv in tdf) {
    for (n in nvals) {

        x = rt(mcrep * n, df=tv)
        x = array(x, c(mcrep, n))
        y = rt(mcrep * n, df=tv)
        y = array(y, c(mcrep, n))

        r = vcor(x, y)

        print(c(n=n, df=tv, approx=1/sqrt(n), actual=sd(r)))
    }
}
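
As an optional sanity check (using the final simulated x and y), vcor should agree with R's built-in cor applied row by row:

stopifnot(abs(vcor(x, y)[1] - cor(x[1,], y[1,])) < 1e-10)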

4) The first-order autocorrelation of a sequence f[1], f[2], … is the correlation between f[i] and f[i-1] for any i. The first-order autocorrelation need not be constant, but here we only consider situations where it is constant. In this exercise, you will write code to generate a sequence of values f in one of the following two ways, where x is a vector whose components are iid and standardized (i.e. they have zero mean and unit variance):

  • For constants a and b, let f[i] = a*f[i-1] + b*x[i] for i>1, and f[1] = x[1].

  • For constants a and b, let f[i] = a*x[i] + b*x[i-1] for i>1, and f[1] = x[1].

Choose one of these two approaches, and derive expressions for the constants a and b such that the first-order autocorrelation of the sequence f is a given value r (you do not need to consider both approaches). Then implement the approach you chose in R. If you use the first approach, you can use either the convolve function or loops (it is not easy to vectorize in other ways). The second approach is easily vectorized.

Next, use simulation to estimate the standard error of the sample correlation coefficient between two independent sequences f and g, generated according to the approach developed above. Repeat this process for several values of r, and describe how the standard error varies with r.

Solution:

The derivations of a and b for both of the models described above are sketched below, followed by the code, which covers both models. The greater the autocorrelation, the greater the standard error of the sample correlation coefficient: positively autocorrelated data contain less information than independent data, so there is more uncertainty about parameters estimated from them.
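
For the AR construction, f[i] = a*f[i-1] + b*x[i]. If the sequence is stationary with unit variance, then var(f[i]) = a^2 + b^2 = 1, and cov(f[i], f[i-1]) = a*var(f[i-1]) = a. The first-order autocorrelation is therefore a, so we take a = r and b = sqrt(1 - r^2).

For the MA construction, f[i] = a*x[i] + b*x[i-1], so var(f[i]) = a^2 + b^2 and cov(f[i], f[i-1]) = a*b. Requiring a^2 + b^2 = 1 and a*b = r, and writing q = a^2, gives q*(1 - q) = r^2, hence q = (1 + sqrt(1 - 4*r^2))/2, a = sqrt(q), and b = sqrt(1 - q). A real solution requires r <= 1/2.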

# Number of simulation replications
mcrep = 10000

# Generate AR data with autocorrelation parameter r.  The result is an
# array with independent rows, such that within each row the data
# follow an AR process.
genar = function(r, n) {
    x = rnorm(n * mcrep)
    x = array(x, c(mcrep, n))
    # x is updated in place, so column j-1 already holds f[,j-1] when
    # column j is computed.
    for (j in 2:n) {
        x[,j] = r*x[,j-1] + sqrt(1 - r^2)*x[,j]
    }
    return(x)
}

# Generate MA data with autocorrelation parameter r.  The result is an
# array with independent rows, such that within each row the data
# follow an MA process.  Note that r <= 0.5 is required here.
genma = function(r, n) {
    x = rnorm(n * mcrep)
    x = array(x, c(mcrep, n))

    f = array(0, c(mcrep, n))
    # Per the problem statement, f[1] = x[1].
    f[,1] = x[,1]
    q = (1 + sqrt(1 - 4*r*r)) / 2
    q = sqrt(q)
    f[,2:n] = q*x[,2:n] + sqrt(1-q^2)*x[,1:(n-1)]
    return(f)
}

# Use vectorization to standardize the rows of x.
vstandardize = function(x) {
    x = x - rowMeans(x)
    x = x / apply(x, 1, sd)
    return(x)
}

# Use vectorization to calculate the correlation coefficients between
# corresponding rows of x and y.
vcor = function(x, y) {
    x = vstandardize(x)
    y = vstandardize(y)
    # As in problem 3, divide by n-1 to match the usual sample
    # correlation coefficient.
    r = rowSums(x * y) / (ncol(x) - 1)
    return(r)
}

# Do the simulation with AR data.
print("AR")
for (n in c(10, 20, 30)) {
    for (r in c(0, 0.25, 0.5, 0.75)) {

        x = genar(r, n)
        y = genar(r, n)

        rh = vcor(x, y)
        print(c(n=n, r=r, iid=1/sqrt(n), actual=sd(rh)))
    }
}

# Do the simulation with MA data.
print("MA")
for (n in c(10, 20, 30)) {
    for (r in c(0, 0.25, 0.5)) {

        x = genma(r, n)
        y = genma(r, n)

        rh = vcor(x, y)
        print(c(n=n, r=r, iid=1/sqrt(n), actual=sd(rh)))
    }
}
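
As an optional sanity check (not part of the assignment), the average lag-one sample autocorrelation of the generated rows should be close to the target value of r, up to small-sample bias; for example, with r = 0.5 and n = 30:

x = genar(0.5, 30)
ac = vcor(x[, 1:29], x[, 2:30])
print(mean(ac))  # should be near 0.5 (small-sample bias pulls it down somewhat)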