OptBandCV <- function(hFrom, hTo, hBy, forcing.use, forcing.win, cutpoint, outcome.use, outcome.win, print = TRUE) { ## OptBandCV ## Created by Devin Caughey on 21 January 2011 ## Last modified: 12 June 2011 ## ## This function tests a number of bandwidth options (determined by ## `hFrom', `hTo', and `hBy') to see which is optimal (in terms of ## having the lowest mean squared error) for estimating the ## relationship between the forcing and outcome variables at a ## boundary point (the "cutpoint" in an RD design). Unlike other ## versions of this function, the weighting function for the local ## regression is a triangular (edge) kernel, which is optimal for ## boundary estimation. The function returns the optimal bandwidth, ## the RD estimate at that bandwidth, and a matrix containing the ## bandwidths tested and their respective CV criteria. It is an ## improved version of an older function, `hOptCV_tri'. ## As of 12 April 2011, the weighting function is vectorized and is ## therefore an order of magnitude faster. ## ## OptBandCV takes the following arguments: ## - `hFrom': Lower limit of range values of h to test ## - `hTo': Upper limit of range of values of h to test ## - `hBy': Increment between each value of h tested ## - forcing.use: The "forcing" variable, which determines ## assignment to treatment and whose relationship with the outcome ## is being modeled. ## - `forcing.win': A proper subset of forcing.use, every value of ## which is used as a "test" cutpoint. Note that the range of ## forcing.win must be sufficiently less than that of forcing.use so ## that the largest value of h tested does not extend beyond the ## range of the data. That is, the following conditions must be ## satisfied: ## (1) max(forcing.use) > max(forcing.win) + hTo ## (2) min(forcing.use) < min(forcing.win) - hTo ## - `outcome.use': the outcome variable, whose difference at the ## cutpoint is the quantity of interest. ## - `outcome.win': A proper subset of outcome.use, corresponding to ## the observations included in forcing.win. ## - `cutpoint': The value of the forcing variable at which ## treatment is assigned. Test cutpoints below (left of) the actual ## cutpoint are estimated using only data to their left, and vice ## versa for test cutpoints above (right of) the actual cutpoint. ## - `print': A logical value indicating whether the results should ## be printed or kept invisible. ## Check for NAs and stop if any are found na <- c(0, 0, 0, 0) if(sum(is.na(forcing.use)) > 0) { na[1] <- 1 } if(sum(is.na(forcing.win)) > 0) { na[2] <- 1 } if(sum(is.na(outcome.use)) > 0) { na[3] <- 1 } if(sum(is.na(outcome.win)) > 0) { na[4] <- 1 } if(sum(na) > 0) { naMsgs <- c("NAs found in forcing.use.\n", "NAs found in forcing.win.\n", "NAs found in outcome.use.\n", "NAs found in outcome.win.\n") stop(naMsgs[which(na == 1)]) } ## Check whether the range of forcing.use is sufficiently larger ## than that of forcing.win if(max(forcing.use) <= max(forcing.win) + hTo | min(forcing.use) >= min(forcing.win) - hTo) { stop("The range of forcing.use is too small relative to the range of forcing.win.") } ## weights, estimate at point, difference in estimate at point EdgeKernel <- function(X, mid, bw, norm = FALSE) { ## Invisibly returns a vector of edge (triangular) kernel weights. ## If an observation's distance from the midpoint is greater than ## bw, it receives zero weight. If it is within the bandwidth, it ## receives a weight equal to 1 minus the distance over the ## bandwidth. ## 'norm' regulates whether the weights should be normalized to a ## mean of 1 (and thus to a sum equal to the number of non-missing ## observations) wts.out <- ifelse(abs(X - mid) > bw, 0, 1 - (abs(X - mid) / bw)) if(norm) { mean.wts <- mean(wts.out, na.rm = TRUE) wts.out <- wts.out / mean.wts } invisible(wts.out) } EdgeLocalEst <- function(Y, X, bw, point, norm = FALSE, print = FALSE, verbose = FALSE) { ## Returns the estimate of the local regression function at ## 'point', with weights determined by an edge kernel with a ## bandwidth of bw. Xcentered <- X - point wts.c <- EdgeKernel(X = Xcentered, mid = 0, bw = bw, norm = TRUE) llr.c <- lm(Y ~ Xcentered, weights = wts.c) if(verbose) { print(summary(llr.c)) } PointEst <- coef(llr.c)[[1]] SEofEst <- sqrt(diag(vcov(llr.c)))[1] names(SEofEst) <- NULL out <- list(PointEst, SEofEst) names(out) <- c("Point Estimate", "Standard Error of Estimate") if(print) { return(out) } if(!print) { invisible(out) } } EdgeLocalDiffEst <- function(Y, X, bw, point, norm = FALSE, print = FALSE, verbose = FALSE) { ## Returns the difference in the value of a function at a given ## point estimated from above and from below, using local linear ## regression with an edge kernel. Xcentered <- X - point wts.c <- EdgeKernel(X = Xcentered, mid = 0, bw = bw, norm = TRUE) trmt <- as.numeric(Xcentered >= 0) llr.c <- lm(Y ~ trmt + Xcentered + I(trmt * Xcentered), weights = wts.c) if(verbose) { print(summary(llr.c)) } DiffEst <- coef(llr.c)[[2]] SEofEst <- sqrt(diag(vcov(llr.c)))[2] names(SEofEst) <- NULL out <- list(DiffEst, SEofEst) names(out) <- c("Estimated Difference", "Standard Error of Estimate") if(print) { return(out) } if(!print) { invisible(out) } } ## Bandwidth (h) options to test h <- seq(from = hFrom, to = hTo, by = hBy) ## Vector to store CV criterion cv <- vector(mode = 'numeric', length = length(h)) is.na(cv) <- TRUE ## Number of observations in testing window n.win <- length(forcing.win) ## Loop over values of h for(j in 1:length(h)) { ## Create variable to store predicted Y's y.hat <- vector(mode = 'numeric', length = n.win) is.na(y.hat) <- TRUE ## Loop over observations in testing window for(i in 1:n.win) { ## Assign i's value of forcing variable to 'c' c <- forcing.win[i] ## If c is left of the cut point if(c < cutpoint) { ## Assign values of forcing variable less than c to x.bw x.bw <- forcing.use[forcing.use < c] ## Assign values of outcome variable less than c to y.bw y.bw <- outcome.use[forcing.use < c] } ## If c is right of the cut point if(c >= cutpoint) { ## Assign values of forcing variable greater than c to x.bw x.bw <- forcing.use[forcing.use > c] ## Assign values of outcome variable greater than c to y.bw y.bw <- outcome.use[forcing.use > c] } ## Fitted value at c y.hat[i] <- EdgeLocalEst(Y = y.bw, X = x.bw, point = c, bw = h[j])[[1]] } ## MSE of h[j] cv[j] <- sum((outcome.win - y.hat) ^ 2) / n.win if(print) { print(data.frame(bandwidth = h[j], MSE = cv[j], time = Sys.time())) } } ## Optimal bandwidth h.opt <- h[which(cv == min(cv))] if(length(h.opt) != 1) { cat("Warning: Multiple values of h are tied for the minimum cross-validation criterion. These are:", paste(h.opt, sep = ", ")) } ## RD estimate using optimal bandwidth rd.est <- EdgeLocalDiffEst(Y = outcome.use, X = forcing.use, point = cutpoint, bw = h.opt) ## Output output <- list(h.opt, rd.est, cbind(h, cv)) names(output) <- c("Optimal Bandwidth (h.opt)", "RD Estimate Using h.opt", "Bandwidth/CV-Criterion Matrix") if(print) { return(output) } else { invisible(output) } }