## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----include = FALSE, echo = FALSE-------------------------------------------- Sys.setenv(LANGUAGE="en") ## ----echo = FALSE------------------------------------------------------------- library(robust2sls) library(ivreg) # for 2sls regression ## ----------------------------------------------------------------------------- param <- generate_param(dx1 = 1, dx2 = 1, dz2 = 1, intercept = TRUE, beta = c(2,-1), sigma = 1, seed = 2) data_full <- generate_data(parameters = param, n = 1000)$data # have a look at the data head(data_full) ## ----------------------------------------------------------------------------- cor(data_full$u, data_full$x2) # correlation for endogenous regressor cor(data_full$u, data_full$z2) # close to zero correlation for valid instrument cor(data_full$x2, data_full$z2) # correlation for informativeness ## ----------------------------------------------------------------------------- fml_ols <- y ~ x2 ols <- lm(formula = fml_ols, data = data_full[, 1:3]) print(ols$coefficients) fml_tsls <- y ~ x2 | z2 tsls <- ivreg(formula = fml_tsls, data = data_full[, c(1:3, 6)]) print(tsls$coefficients) ## ----------------------------------------------------------------------------- # extract the variables we observe data <- data.frame(data_full[, c(1:3, 6)]) # not iterating the algorithm robustified_0 <- outlier_detection(data = data, formula = fml_tsls, ref_dist = "normal", sign_level = 0.01, initial_est = "robustified", iterations = 0) print(robustified_0) # iterating algorithm until convergence robustified_conv <- outlier_detection(data = data, formula = fml_tsls, ref_dist = "normal", sign_level = 0.01, initial_est = "robustified", iterations = "convergence", convergence_criterion = 0) print(robustified_conv) ## ----------------------------------------------------------------------------- sum(abs(data_full[, "u"]) > 2.58) ## ----------------------------------------------------------------------------- # which observations had large true errors? large_true <- which(abs(data_full[, "u"]) > 2.58) # which observations were detected as outliers # both step 0 and iterated version had same starting point, so same initial classification large_detected_0 <- which(robustified_conv$type$m0 == 0) large_detected_3 <- which(robustified_conv$type$m3 == 0) # how much do the sets of true and detected large errors overlap? sum(large_detected_0 %in% large_true) # 12 of the 13 detected outliers really had large errors sum(large_detected_3 %in% large_true) # 13 of the 15 detected outliers really had large errors # which were wrongly detected as having large errors? ind <- large_detected_3[which(!(large_detected_3 %in% large_true))] # what were their true error values? data_full[ind, "u"] # relatively large values even though technically smaller than cut-off # which large true errors were missed? ind2 <- large_true[which(!(large_true %in% large_detected_3))] # what were their true error values? data_full[ind2, "u"] # one of them is close to the cut-off ## ----------------------------------------------------------------------------- # not iterating the algorithm saturated_0 <- outlier_detection(data = data, formula = fml_tsls, ref_dist = "normal", sign_level = 0.01, initial_est = "saturated", iterations = 0, split = 0.5) print(saturated_0) # iterating algorithm until convergence saturated_conv <- outlier_detection(data = data, formula = fml_tsls, ref_dist = "normal", sign_level = 0.01, initial_est = "saturated", split = 0.5, iterations = "convergence", convergence_criterion = 0) print(saturated_conv) # which did it find in final selection? large_detected_4 <- which(saturated_conv$type$m4 == 0) identical(large_detected_3, large_detected_4) ## ----------------------------------------------------------------------------- # final model (iteration 3) without corrected standard errors summary(robustified_conv$model$m3)$coef # final model with corrected standard errors, m = 3 (subset of output) beta_inf(robustified_conv, iteration = 3, exact = TRUE)[, 1:5] # final model with corrected standard errors, m = "fixed point" / "converged" (subset of output) beta_inf(robustified_conv, iteration = 3, exact = TRUE, fp = TRUE)[, 1:5] ## ----------------------------------------------------------------------------- # use case re-sampling (to save time, use low iteration m = 1) resampling <- case_resampling(robustified_conv, R = 1000, m = 1) beta_boot <- evaluate_boot(resampling, iterations = 1) mat <- matrix(beta_boot[, 1:2], ncol = 1, nrow = 2) # show subset, put into column vector colnames(mat) <- "bootStd. Error" cbind(beta_inf(robustified_conv, iteration = 1, exact = TRUE)[, 1:3], mat) ## ----------------------------------------------------------------------------- # t-test on a single coefficient # testing difference in beta_2 (coef on endogenous regressor x2), m = 3 beta_t(robustified_conv, iteration = 3, element = "x2") # testing difference in beta_2 (coef on endogenous regressor x2), m = "fixed point" beta_t(robustified_conv, iteration = 3, element = "x2", fp = TRUE) # Hausman-type test on whole coefficient vector, m = 3 beta_hausman(robustified_conv, iteration = 3) ## ----------------------------------------------------------------------------- data_full_contaminated <- data_full outlier_location <- sample(1:NROW(data_full), size = 0.03*NROW(data_full), replace = FALSE) outlier_size <- sample(c(-3.5, 3.5), size = length(outlier_location), replace = TRUE) # replace errors data_full_contaminated[outlier_location, "u"] <- outlier_size # replace value of dependent variable data_full_contaminated[outlier_location, "y"] <- data_full_contaminated[outlier_location, "y"] - data_full[outlier_location, "u"] + data_full_contaminated[outlier_location, "u"] # extract the data that researcher would actually collect data_cont <- data.frame(data_full_contaminated[, c(1:3, 6)]) ## ----------------------------------------------------------------------------- # not iterating the algorithm robustified_0 <- outlier_detection(data = data_cont, formula = fml_tsls, ref_dist = "normal", sign_level = 0.01, initial_est = "robustified", iterations = 0) print(robustified_0) # iterating algorithm until convergence robustified_conv <- outlier_detection(data = data_cont, formula = fml_tsls, ref_dist = "normal", sign_level = 0.01, initial_est = "robustified", iterations = "convergence", convergence_criterion = 0) print(robustified_conv) ## ----------------------------------------------------------------------------- # which observations were classified as outliers? detected_0 <- which(robustified_conv$type$m0 == 0) detected_7 <- which(robustified_conv$type$m7 == 0) # overlap between deterministic outliers sum(outlier_location %in% detected_0) # initial found all 30 (+1 in addition) sum(outlier_location %in% detected_7) # converged found all 30 (+ 15 in addition) ## ----------------------------------------------------------------------------- # full sample model tsls_cont <- ivreg(formula = fml_tsls, data = data_cont) tsls_cont$coefficients # updated estimates after initial classification and removal of outliers robustified_conv$model$m1$coefficients # updated estimates after convergence robustified_conv$model$m7$coefficients ## ----------------------------------------------------------------------------- # use case re-sampling (to save time, use low iteration m = 1) resampling <- case_resampling(robustified_conv, R = 1000, m = 1) beta_boot <- evaluate_boot(resampling, iterations = 1) mat <- matrix(beta_boot[, 1:2], ncol = 1, nrow = 2) # show subset, put into column vector colnames(mat) <- "bootStd. Error" cbind(beta_inf(robustified_conv, iteration = 1, exact = TRUE)[, 1:3], mat) ## ----------------------------------------------------------------------------- # not iterating the algorithm saturated_0 <- outlier_detection(data = data_cont, formula = fml_tsls, ref_dist = "normal", sign_level = 0.01, initial_est = "saturated", iterations = 0, split = 0.5) print(saturated_0) # iterating algorithm until convergence saturated_conv <- outlier_detection(data = data_cont, formula = fml_tsls, ref_dist = "normal", sign_level = 0.01, initial_est = "saturated", split = 0.5, iterations = "convergence", convergence_criterion = 0) print(saturated_conv) # which did it find in final selection? detected_7_sat <- which(saturated_conv$type$m7 == 0) identical(detected_7, detected_7_sat)