OptBandCV <- function(hFrom, hTo, hBy,
forcing.use, forcing.win,
cutpoint,
outcome.use, outcome.win,
print = TRUE) {
## OptBandCV
## Created by Devin Caughey on 21 January 2011
## Last modified: 12 June 2011
##
## This function tests a number of bandwidth options (determined by
## `hFrom', `hTo', and `hBy') to see which is optimal (in terms of
## having the lowest mean squared error) for estimating the
## relationship between the forcing and outcome variables at a
## boundary point (the "cutpoint" in an RD design). Unlike other
## versions of this function, the weighting function for the local
## regression is a triangular (edge) kernel, which is optimal for
## boundary estimation. The function returns the optimal bandwidth,
## the RD estimate at that bandwidth, and a matrix containing the
## bandwidths tested and their respective CV criteria. It is an
## improved version of an older function, `hOptCV_tri'.
## As of 12 April 2011, the weighting function is vectorized and is
## therefore an order of magnitude faster.
##
## OptBandCV takes the following arguments:
## - `hFrom': Lower limit of range values of h to test
## - `hTo': Upper limit of range of values of h to test
## - `hBy': Increment between each value of h tested
## - forcing.use: The "forcing" variable, which determines
## assignment to treatment and whose relationship with the outcome
## is being modeled.
## - `forcing.win': A proper subset of forcing.use, every value of
## which is used as a "test" cutpoint. Note that the range of
## forcing.win must be sufficiently less than that of forcing.use so
## that the largest value of h tested does not extend beyond the
## range of the data. That is, the following conditions must be
## satisfied:
## (1) max(forcing.use) > max(forcing.win) + hTo
## (2) min(forcing.use) < min(forcing.win) - hTo
## - `outcome.use': the outcome variable, whose difference at the
## cutpoint is the quantity of interest.
## - `outcome.win': A proper subset of outcome.use, corresponding to
## the observations included in forcing.win.
## - `cutpoint': The value of the forcing variable at which
## treatment is assigned. Test cutpoints below (left of) the actual
## cutpoint are estimated using only data to their left, and vice
## versa for test cutpoints above (right of) the actual cutpoint.
## - `print': A logical value indicating whether the results should
## be printed or kept invisible.
## Check for NAs and stop if any are found
na <- c(0, 0, 0, 0)
if(sum(is.na(forcing.use)) > 0) {
na[1] <- 1
}
if(sum(is.na(forcing.win)) > 0) {
na[2] <- 1
}
if(sum(is.na(outcome.use)) > 0) {
na[3] <- 1
}
if(sum(is.na(outcome.win)) > 0) {
na[4] <- 1
}
if(sum(na) > 0) {
naMsgs <- c("NAs found in forcing.use.\n",
"NAs found in forcing.win.\n",
"NAs found in outcome.use.\n",
"NAs found in outcome.win.\n")
stop(naMsgs[which(na == 1)])
}
## Check whether the range of forcing.use is sufficiently larger
## than that of forcing.win
if(max(forcing.use) <= max(forcing.win) + hTo |
min(forcing.use) >= min(forcing.win) - hTo) {
stop("The range of forcing.use is too small relative to the range of forcing.win.")
}
## weights, estimate at point, difference in estimate at point
EdgeKernel <- function(X, mid, bw, norm = FALSE) {
## Invisibly returns a vector of edge (triangular) kernel weights.
## If an observation's distance from the midpoint is greater than
## bw, it receives zero weight. If it is within the bandwidth, it
## receives a weight equal to 1 minus the distance over the
## bandwidth.
## 'norm' regulates whether the weights should be normalized to a
## mean of 1 (and thus to a sum equal to the number of non-missing
## observations)
wts.out <- ifelse(abs(X - mid) > bw, 0, 1 - (abs(X - mid) / bw))
if(norm) {
mean.wts <- mean(wts.out, na.rm = TRUE)
wts.out <- wts.out / mean.wts
}
invisible(wts.out)
}
EdgeLocalEst <- function(Y, X, bw, point,
norm = FALSE, print = FALSE,
verbose = FALSE) {
## Returns the estimate of the local regression function at
## 'point', with weights determined by an edge kernel with a
## bandwidth of bw.
Xcentered <- X - point
wts.c <- EdgeKernel(X = Xcentered, mid = 0, bw = bw, norm = TRUE)
llr.c <- lm(Y ~ Xcentered, weights = wts.c)
if(verbose) {
print(summary(llr.c))
}
PointEst <- coef(llr.c)[[1]]
SEofEst <- sqrt(diag(vcov(llr.c)))[1]
names(SEofEst) <- NULL
out <- list(PointEst, SEofEst)
names(out) <- c("Point Estimate", "Standard Error of Estimate")
if(print) {
return(out)
}
if(!print) {
invisible(out)
}
}
EdgeLocalDiffEst <- function(Y, X, bw, point,
norm = FALSE, print = FALSE,
verbose = FALSE) {
## Returns the difference in the value of a function at a given
## point estimated from above and from below, using local linear
## regression with an edge kernel.
Xcentered <- X - point
wts.c <- EdgeKernel(X = Xcentered, mid = 0, bw = bw, norm = TRUE)
trmt <- as.numeric(Xcentered >= 0)
llr.c <- lm(Y ~ trmt + Xcentered + I(trmt * Xcentered),
weights = wts.c)
if(verbose) {
print(summary(llr.c))
}
DiffEst <- coef(llr.c)[[2]]
SEofEst <- sqrt(diag(vcov(llr.c)))[2]
names(SEofEst) <- NULL
out <- list(DiffEst, SEofEst)
names(out) <- c("Estimated Difference", "Standard Error of Estimate")
if(print) {
return(out)
}
if(!print) {
invisible(out)
}
}
## Bandwidth (h) options to test
h <- seq(from = hFrom, to = hTo, by = hBy)
## Vector to store CV criterion
cv <- vector(mode = 'numeric', length = length(h))
is.na(cv) <- TRUE
## Number of observations in testing window
n.win <- length(forcing.win)
## Loop over values of h
for(j in 1:length(h)) {
## Create variable to store predicted Y's
y.hat <- vector(mode = 'numeric', length = n.win)
is.na(y.hat) <- TRUE
## Loop over observations in testing window
for(i in 1:n.win) {
## Assign i's value of forcing variable to 'c'
c <- forcing.win[i]
## If c is left of the cut point
if(c < cutpoint) {
## Assign values of forcing variable less than c to x.bw
x.bw <- forcing.use[forcing.use < c]
## Assign values of outcome variable less than c to y.bw
y.bw <- outcome.use[forcing.use < c]
}
## If c is right of the cut point
if(c >= cutpoint) {
## Assign values of forcing variable greater than c to x.bw
x.bw <- forcing.use[forcing.use > c]
## Assign values of outcome variable greater than c to y.bw
y.bw <- outcome.use[forcing.use > c]
}
## Fitted value at c
y.hat[i] <- EdgeLocalEst(Y = y.bw, X = x.bw,
point = c, bw = h[j])[[1]]
}
## MSE of h[j]
cv[j] <- sum((outcome.win - y.hat) ^ 2) / n.win
if(print) {
print(data.frame(bandwidth = h[j], MSE = cv[j], time = Sys.time()))
}
}
## Optimal bandwidth
h.opt <- h[which(cv == min(cv))]
if(length(h.opt) != 1) {
cat("Warning: Multiple values of h are tied for the minimum cross-validation criterion. These are:",
paste(h.opt, sep = ", "))
}
## RD estimate using optimal bandwidth
rd.est <- EdgeLocalDiffEst(Y = outcome.use,
X = forcing.use,
point = cutpoint,
bw = h.opt)
## Output
output <- list(h.opt, rd.est, cbind(h, cv))
names(output) <- c("Optimal Bandwidth (h.opt)",
"RD Estimate Using h.opt",
"Bandwidth/CV-Criterion Matrix")
if(print) {
return(output)
} else {
invisible(output)
}
}