From dfef56afe2f99ed355e8cca00f8b1ca696877844 Mon Sep 17 00:00:00 2001 From: Yiqing Xu Date: Sat, 18 Feb 2017 13:06:38 -0800 Subject: [PATCH] CRAN version --- DESCRIPTION | 7 +- NAMESPACE | 19 +- R/RcppExports.R | 0 R/interflex.R | 3686 +++++++++++++++++++------------------- README.md | 4 +- man/inter.binning.Rd | 17 +- man/inter.gam.Rd | 3 +- man/inter.kernel.Rd | 18 +- man/inter.raw.Rd | 7 +- man/interflex-package.Rd | 6 +- man/s1.Rd | 10 + man/s2.Rd | 10 + man/s3.Rd | 10 + man/s4.Rd | 9 + src/RcppExports.cpp | 0 src/fastplm.cpp | 600 +++---- src/symbols.rds | Bin 0 -> 5186 bytes stata/interflex.ado | 176 +- 18 files changed, 2352 insertions(+), 2230 deletions(-) mode change 100644 => 100755 DESCRIPTION mode change 100644 => 100755 NAMESPACE mode change 100644 => 100755 R/RcppExports.R mode change 100644 => 100755 R/interflex.R mode change 100644 => 100755 README.md mode change 100644 => 100755 man/inter.binning.Rd mode change 100644 => 100755 man/inter.gam.Rd mode change 100644 => 100755 man/inter.kernel.Rd mode change 100644 => 100755 man/inter.raw.Rd mode change 100644 => 100755 man/interflex-package.Rd create mode 100755 man/s1.Rd create mode 100755 man/s2.Rd create mode 100755 man/s3.Rd create mode 100755 man/s4.Rd mode change 100644 => 100755 src/RcppExports.cpp mode change 100644 => 100755 src/fastplm.cpp create mode 100755 src/symbols.rds diff --git a/DESCRIPTION b/DESCRIPTION old mode 100644 new mode 100755 index 764b8e8..f9d8cd8 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,16 +5,15 @@ Version: 1.0.2 Date: 2017-1-25 Author: Jens Hainmueller, Jonathan Mummolo and Yiqing Xu Maintainer: Yiqing Xu -Description: Performs diagnostic tests of multiplicative interaction models and plots non-linear marginal effects of a treatment on an outcome across different values of a moderator +Description: Performs diagnostic tests of multiplicative interaction models and plots non-linear marginal effects of a treatment on an outcome across different values of a moderator. License: GPL-2 Imports: Rcpp (>= 0.12.3), ggplot2 (>= 2.1.0), - mgcv (>= 1.8-16), sandwich (>= 2.3-4), pcse (>= 1.9), - plotrix (>= 3.6-4), lmtest (>= 0.9-34), Lmoments (>= 1.2-3), doParallel (>= 1.0.10), - foreach (>= 1.4.3) + foreach (>= 1.4.3), + mgcv(>= 1.8-16) LinkingTo: Rcpp, RcppArmadillo diff --git a/NAMESPACE b/NAMESPACE old mode 100644 new mode 100755 index d55da2d..62eab70 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,25 @@ useDynLib(interflex) importFrom(Rcpp, evalCpp) +importFrom("grDevices", "col2rgb", "graphics.off", "pdf", "rgb") +importFrom("parallel", "detectCores", "stopCluster", "makeCluster") +importFrom("graphics", "hist", "par", "plot") +importFrom("stats", "as.formula", "density", "dnorm", "lm", "median", + "model.matrix", "na.omit", "pt", "qt", "quantile", "sd", + "var", "vcov") +importFrom("ggplot2","geom_errorbar", "geom_hline", "geom_line", "geom_point", "geom_rect", + "geom_ribbon", "geom_smooth", "geom_vline", "ggplot", "ggtitle", "theme", "aes", + "element_text", "annotate", "xlab", "ylab", "aes_string", "coord_cartesian", + "facet_wrap", "facet_grid") +importFrom("mgcv","vis.gam","gam") +importFrom("sandwich","sandwich","estfun") +importFrom("pcse","pcse") +importFrom("lmtest","waldtest") +importFrom("Lmoments","Lmoments") +importFrom("foreach","foreach","%dopar%") +importFrom("doParallel","registerDoParallel") # exportPattern("^[[:alpha:]]+") export(inter.raw) export(inter.gam) export(inter.binning) export(inter.kernel) -export(fastplm) \ No newline at end of file +# export(fastplm) \ No newline at end of file diff --git a/R/RcppExports.R b/R/RcppExports.R old mode 100644 new mode 100755 diff --git a/R/interflex.R b/R/interflex.R old mode 100644 new mode 100755 index 7381a23..7af82cf --- a/R/interflex.R +++ b/R/interflex.R @@ -1,1833 +1,1853 @@ -## Jens Hainmueller; Jonathan Mummolo; Yiqing Xu -## This Version: 2015.12.05 - -###### Interpreting Interaction Models ####### - -## 1. inter.raw: first look at the data: D, X, Y -## 2. inter.gam: GAM plots -## 3. inter.binning: estimat -## 4. inter.kernel: kernel smooth plot - -## Supporting -## 1. coefs: kernel weighted least squares -## 2. crossvalidation: cross validate kernel bandwidth -## 3. vcovCluster: clustered standard error - - -################################################# - -inter.raw<-function(Y,D,X,weights=NULL,data, - nbins=3,cutoffs=NULL, span=NULL, - Ylabel=NULL,Dlabel= NULL,Xlabel=NULL, - pos=NULL){ - - ## Y: outcome - ## D: "treatment" indicator - ## X: covariate to be interacted with D - ## nbins: ## of bins - ## cutoffs: specified cut-points - ## span: bandwidth for loess - - ## check input - if (is.character(Y) == FALSE) { - stop("Y is not a string.") - } else { - Y <- Y[1] - } - if (is.character(D) == FALSE) { - stop("D is not a string.") - } else { - D <- D[1] - } - if (is.character(X) == FALSE) { - stop("X is not a string.") - } else { - X <- X[1] - } - if (is.null(weights) == FALSE) { - if (is.character(weights) == FALSE) { - stop("weigths is not a string.") - } else { - weights <- weights[1] - } - } - if (is.data.frame(data) == FALSE) { - stop("Not a data frame.") - } - if (is.null(nbins) == FALSE) { - if (nbins%%1 != 0) { - stop("nbins is not a positive integer.") - } else { - nbins <- nbins[1] - } - if (nbins < 1) { - stop("nbins is not a positive integer.") - } - } - if (is.null(cutoffs) == FALSE) { - if (is.numeric(cutoffs) == FALSE) { - stop("Some element of cutoffs is not numeric.") - } - } - if (is.null(span) == FALSE) { - if (is.numeric(span) == FALSE) { - stop("span is not numeric.") - } else { - span <- span[1] - if (span <= 0) { - stop("span is not a positive number.") - } - } - } - if (is.null(Ylabel)==TRUE) { - Ylabel <- Y - } else { - if (is.character(Ylabel) == FALSE) { - stop("Ylabel is not a string.") - } else { - Ylabel <- Ylabel[1] - } - } - if (is.null(Dlabel)==TRUE) { - Dlabel <- D - } else { - if (is.character(Dlabel) == FALSE) { - stop("Dlabel is not a string.") - } else { - Dlabel <- Dlabel[1] - } - } - if (is.null(Xlabel)==TRUE) { - Xlabel <- X - } else { - if (is.character(Xlabel) == FALSE) { - stop("Xlabel is not a string.") - } else { - Xlabel <- Xlabel[1] - } - } - - ## load packages - library(ggplot2) - - ## drop missing values - data <- na.omit(data[,c(Y, D, X)]) - - ## ploting - if (length(unique(data[,D]))==2) { ## binary case - - ## plotting - treat.lab<-c(paste(Dlabel,"= 0"),paste(Dlabel,"= 1")) - data.aug<-data - data.aug$treat<-factor(data[,D], labels=treat.lab) - - if (is.null(pos)==TRUE) { - box.pos.co <- min(data[,Y]) - box.pos.tr <- max(data[,Y]) - } else { - box.pos.co <- pos[1] - box.pos.tr <- pos[2] - } - - tr <- which(data[,D]==1) - co <- which(data[,D]==0) - qt90.tr <- quantile(data[tr,X],c(0.05,0.95)) - qt90.co <- quantile(data[co,X],c(0.05,0.95)) - qt50.tr <- quantile(data[tr,X],c(0.25,0.75)) - qt50.co <- quantile(data[co,X],c(0.25,0.75)) - med.tr <- median(data[tr,X]) - med.co <- median(data[co,X]) - - data.aug$qt90 <- NA - data.aug$qt90[which(data.aug[,D]==1 & - data.aug[,X]>=qt90.tr[1] & - data.aug[,X]<=qt90.tr[2])]<- box.pos.tr - data.aug$qt90[which(data.aug[,D]==0 & - data.aug[,X]>=qt90.co[1] & - data.aug[,X]<=qt90.co[2])]<- box.pos.co - data.aug$qt50 <- NA - data.aug$qt50[which(data.aug[,D]==1 & - data.aug[,X]>=qt50.tr[1] & - data.aug[,X]<=qt50.tr[2])]<- box.pos.tr - data.aug$qt50[which(data.aug[,D]==0 & - data.aug[,X]>=qt50.co[1] & - data.aug[,X]<=qt50.co[2])]<- box.pos.co - data.aug$med <- NA - data.aug$med[tr[which.min(abs(data.aug[tr,X]-med.tr))]]<- box.pos.tr - data.aug$med[co[which.min(abs(data.aug[co,X]-med.co))]]<- box.pos.co - - - ## plotting - if (is.null(weights)==TRUE) { - p1 <- ggplot(data.aug, aes_string(X, Y)) - } else { - p1 <- ggplot(data.aug, aes_string(X, Y, weight=weights)) - - } - p1 <- p1 + geom_point() + - geom_smooth(method = "lm", se = F, fullrange = T, - colour = "steelblue", size = 1) - - if (is.null(span)==TRUE) { - p1 <- p1+ geom_smooth(method = "loess", formula = y ~ x, - se = F, colour="red") - } else { - p1 <- p1 + geom_smooth(method = "loess", formula = y ~ x, - se = F, colour="red",span=span) - } - p1 <- p1 + xlab(Xlabel) + ylab(Ylabel) - - - p1 <- p1 + geom_line(aes_string(X,"qt90"), - size=1,colour="grey50", na.rm = TRUE) - p1 <- p1 + geom_line(aes_string(X,"qt50"), - size=3,colour="grey50", na.rm = TRUE) - p1 <- p1 + geom_point(aes_string(X,"med"),size=3, - shape=21,fill="white",colour="red", na.rm = TRUE) - - p1 <- p1 + theme(axis.title = element_text(size=15)) - - p1 <- p1 + facet_wrap(~treat, ncol=1) - #p1 <- p1 + facet_grid(treat ~.) - - - - } else { # continuous case - - ## grouping by X - if (is.null(cutoffs)==TRUE) { - cutoff<-quantile(data[,X],probs=seq(0,1,1/nbins)) - while (length(unique(cutoff))!=nbins+1) { - nbins<-nbins-1 - cutoff<-quantile(data[,X],probs=seq(0,1,1/nbins)) - } - } else { - cutoffs <-cutoffs[which(cutoffs>min(data[,X]) & cutoffs < max(data[,X]))] - cutoff<- sort(unique(c(min(data[,X]),cutoffs,max(data[,X])))) - } - groupID<-cut(data[,X],breaks=cutoff, labels = FALSE) - groupID[which(data[,X]==min(data[,X]))]<-1 - - ## X labels - groupID2 <- cut(data[,X],breaks=cutoff) - gp.lab = paste(Xlabel, ": ", levels(groupID2), sep="") - gp.lab[1] <- paste(Xlabel, ": [", substring(levels(groupID2)[1],2), sep = "") - nbins <- length(unique(groupID)) - - ## if (nbins==2) { - ## gp.lab<-c(paste(Xlabel,": low",sep=""),paste(Xlabel,": high",sep="")) - ## } else if (nbins==3) { - ## gp.lab<-c(paste(Xlabel,": low",sep=""),paste(Xlabel,": medium",sep=""), - ## paste(Xlabel,": high",sep="")) - ## } else { - ## gp.lab<-c(); - ## for (i in 1:nbins) { - ## gp.lab<-c(gp.lab,paste("Grp",i)) - ## } - ## } - - groupID <- factor(groupID, labels=gp.lab) - data.aug <- data - data.aug$groupID<-groupID - - ## plotting - if (is.null(weights)==TRUE) { - p1 <- ggplot(data.aug, aes_string(D, Y)) - } else { - p1 <- ggplot(data.aug, aes_string(D, Y,weight=weights)) - } - p1 <- p1 + geom_point() + - geom_smooth(method = "lm", se = F, fullrange = T, - colour = "steelblue", size = 1) - if (is.null(span)==TRUE) { - p1 <- p1 + - geom_smooth(method = "loess", formula = y ~ x, - se = F, colour="red") - } else { - p1 <- p1 + - geom_smooth(method = "loess", formula = y ~ x, - se = F, colour="red",span=span) - } - - p1 <- p1 + xlab(Dlabel) + ylab(Ylabel) + facet_grid(.~groupID) - - } - return(list(graph = p1)) -} - - -############################################################# -## GAM - - -inter.gam<-function(Y,D,X,Z=NULL,weights=NULL,FE=NULL,data, - SE = FALSE, - k=10, - angle=c(30,100,-30,-120), - Ylabel = NULL, Dlabel = NULL, Xlabel = NULL){ - - ## check input - if (is.character(Y) == FALSE) { - stop("Y is not a string.") - } else { - Y <- Y[1] - } - if (is.character(D) == FALSE) { - stop("D is not a string.") - } else { - D <- D[1] - } - if (is.character(X) == FALSE) { - stop("X is not a string.") - } else { - X <- X[1] - } - if (is.null(Z) == FALSE) { - for (i in 1:length(Z)) { - if (is.character(Z[i]) == FALSE) { - stop("Some element in Z is not a string.") - } - } - } - if (is.null(FE) == FALSE) { - for (i in 1:length(FE)) { - if (is.character(FE[i]) == FALSE) { - stop("Some element in FE is not a string.") - } - } - } - if (is.null(weights) == FALSE) { - if (is.character(weights) == FALSE) { - stop("weigths is not a string.") - } else { - weights <- weights[1] - } - } - if (is.data.frame(data) == FALSE) { - stop("Not a data frame.") - } - if (is.logical(SE) == FALSE & is.numeric(SE)==FALSE) { - stop("SE is not a logical flag.") - } - if (is.null(k) == FALSE) { - if (is.numeric(k) == FALSE) { - stop("k is not numeric.") - } else { - k <- k[1] - if (k <= 0) { - stop("k is not a positive number.") - } - } - } - if (length(angle)>=5 | length(angle)<1) { - stop("angle must have length 1 to 4.") - } else { - if (is.numeric(angle)==FALSE) { - stop("Some element in angle is not numeric.") - } - } - if (is.null(Ylabel)==TRUE) { - Ylabel <- Y - } else { - if (is.character(Ylabel) == FALSE) { - stop("Ylabel is not a string.") - } else { - Ylabel <- Ylabel[1] - } - } - if (is.null(Dlabel)==TRUE) { - Dlabel <- D - } else { - if (is.character(Dlabel) == FALSE) { - stop("Dlabel is not a string.") - } else { - Dlabel <- Dlabel[1] - } - } - if (is.null(Xlabel)==TRUE) { - Xlabel <- X - } else { - if (is.character(Xlabel) == FALSE) { - stop("Xlabel is not a string.") - } else { - Xlabel <- Xlabel[1] - } - } - if (length(unique(data[,D]))==2) { - warning("The treatment variable is dichotomous.") - } - - ## drop missing values - data <- na.omit(data[,c(Y, D, X, Z)]) - - require(mgcv) - if (is.null(FE)==FALSE) { - if (is.null(Z)==TRUE) {Z<-c()} - for (i in 1:length(FE)) { - Z<-c(Z,paste("as.factor(",FE[i],")",sep="")) - } - } - - if (is.null(Z)==TRUE) { # no controls - formula<-as.formula(paste(Y,"~","s(",D,",",X,",k=",k,")")) - } else { - formula<-as.formula(paste(Y,"~","s(",D,",",X,",k=",k,")+", - paste(Z,collapse="+"))) - } - if (is.null(weights)==TRUE) { - model<-gam(formula, data=data) - } else { - model<-gam(formula, data=data,weights=data[,weights]) - } - par(mfrow=c(2,2),mar=c(2,2,0,0)) - if (SE==0) { - for (i in angle) { - vis.gam(model, view=c(D,X), type="response",cex.lab=1.5, - xlab = Dlabel, ylab = Xlabel, zlab=Ylabel, - ticktype="detailed",plot.type="persp",n.grid=40,too.far=0.5,theta=i,phi=20) - } - } else { - for (i in angle) { - vis.gam(model, view=c(D, X), type="response",cex.lab=1.5, - xlab = Dlabel, ylab = Xlabel, zlab=Ylabel, - ticktype="detailed",plot.type="persp",se=2,n.grid=40,too.far=0.5,theta=30,phi=20) - } - } -} - - -#################################################### - -inter.binning<-function( - Y, # outcome - D, # treatment indicator - X, # moderator - Z = NULL, # covariates - FE = NULL, # fixed effects - weights = NULL, # weigthing variable - data, - na.rm = FALSE, - nbins = 3, # No. of X bins - cutoffs = NULL, - vartype = "homoscedastic", # variance type - ## "homoscedastic" (default); "robust"; "cluster", "pcse" - cl = NULL, # variable to be clustered on - time = NULL, # time variable for pcse - pairwise = TRUE, # pcse option - figure = TRUE, - main = NULL, - Ylabel = NULL, - Dlabel = NULL, - Xlabel = NULL, - xlim = NULL, - ylim = NULL, - interval = NULL, - Xdistr = "histogram", # c("density","histogram") - wald = TRUE - ){ - - - - ## check input - if (is.character(Y) == FALSE) { - stop("Y is not a string.") - } else { - Y <- Y[1] - } - if (is.character(D) == FALSE) { - stop("D is not a string.") - } else { - D <- D[1] - } - if (is.character(X) == FALSE) { - stop("X is not a string.") - } else { - X <- X[1] - } - if (is.null(Z) == FALSE) { - for (i in 1:length(Z)) { - if (is.character(Z[i]) == FALSE) { - stop("Some element in Z is not a string.") - } - } - } - if (is.null(FE) == FALSE) { - for (i in 1:length(FE)) { - if (is.character(FE[i]) == FALSE) { - stop("Some element in FE is not a string.") - } - } - } - if (is.null(weights) == FALSE) { - if (is.character(weights) == FALSE) { - stop("weigths is not a string.") - } else { - weights <- weights[1] - } - } - if (is.data.frame(data) == FALSE) { - stop("Not a data frame.") - } - if (is.logical(na.rm) == FALSE & is.numeric(na.rm)==FALSE) { - stop("na.rm is not a logical flag.") - } - if (is.null(nbins) == FALSE) { - if (nbins%%1 != 0) { - stop("nbins is not a positive integer.") - } else { - nbins <- nbins[1] - } - if (nbins < 1) { - stop("nbins is not a positive integer.") - } - } - if (is.null(cutoffs) == FALSE) { - if (is.numeric(cutoffs) == FALSE) { - stop("Some element of cutoffs is not numeric.") - } - } - if (is.null(vartype)==TRUE) { - vartype <- "homoscedastic" - } - if (!vartype %in% c("homoscedastic","robust","cluster","pcse")){ - stop("vartype must be one of the following: \"homoscedastic\",\"robust\",\"cluster\",\"pcse\".") - } else if (vartype == "cluster") { - if (is.null(cl)==TRUE) { - stop("cl not specified; set cl = \"varname\".") - } - } else if (vartype == "pcse") { - if (is.null(cl)==TRUE | is.null(time)==TRUE) { - stop("cl or time not specified; set cl = \"varname1\", time = \"varname2\":.") - } - } - if (is.null(cl)==FALSE) { - if (is.character(cl) == FALSE) { - stop("cl is not a string.") - } else { - cl <- cl[1] - if (vartype != "pcse") { - vartype <- "cluster" - } - } - } - if (is.null(time)==FALSE) { - if (is.character(time) == FALSE) { - stop("time is not a string.") - } else { - time <- time[1] - } - } - if (is.logical(pairwise) == FALSE & is.numeric(pairwise)==FALSE) { - stop("pairwise is not a logical flag.") - } - if (is.null(Ylabel)==TRUE) { - Ylabel <- Y - } else { - if (is.character(Ylabel) == FALSE) { - stop("Ylabel is not a string.") - } else { - Ylabel <- Ylabel[1] - } - } - if (is.null(Dlabel)==TRUE) { - Dlabel <- D - } else { - if (is.character(Dlabel) == FALSE) { - stop("Dlabel is not a string.") - } else { - Dlabel <- Dlabel[1] - } - } - if (is.null(Xlabel)==TRUE) { - Xlabel <- X - } else { - if (is.character(Xlabel) == FALSE) { - stop("Xlabel is not a string.") - } else { - Xlabel <- Xlabel[1] - } - } - if (is.null(main)==FALSE) { - main <- main[1] - } - if (!Xdistr %in% c("hist","histogram","density")){ - stop("Xdistr must be either \"histogram\" or \"density\".") - } - if (is.null(xlim)==FALSE) { - if (is.numeric(xlim)==FALSE) { - stop("Some element in xlim is not numeric.") - } else { - if (length(xlim)!=2) { - stop("xlim must be of length 2.") - } - } - } - if (is.null(ylim)==FALSE) { - if (is.numeric(ylim)==FALSE) { - stop("Some element in ylim is not numeric.") - } else { - if (length(ylim)!=2) { - stop("ylim must be of length 2.") - } - } - } - - ## check missing values - if (na.rm == TRUE) { - data <- na.omit(data[,c(Y, D, X, Z, FE)]) - } else { - if (sum(is.na(data[,c(Y, D, X, Z, FE)]))>0) { - stop("Missing values. Try option na.rm = TRUE\n") - } - } - - ################################# - - library(ggplot2) - n<-dim(data)[1] - data[,D]<-as.numeric(data[,D]) - - ## parsing fixed effects - if (is.null(FE)==FALSE) { - if (is.null(Z)==TRUE) {Z<-c()} - for (i in 1:length(FE)) { - Z<-c(Z,paste("as.factor(",FE[i],")",sep="")) - } - } - - ## a naive fit - if (is.null(Z)==FALSE) { - mod.f<-as.formula(paste(Y,"~",D,"+",X,"+",D,"*",X,"+",paste(Z,collapse="+"),sep="")) - } else { - mod.f<-as.formula(paste(Y,"~",D,"+",X,"+",D,"*",X,sep="")) - } - if (is.null(weights)==TRUE) { - mod.naive<-lm(mod.f,data=data) - } else { - mod.naive<-lm(mod.f,data=data,weights=data[,weights]) - } - ## coefficients - coefs<-summary(mod.naive)$coefficients[,1] - coef.D<-coefs[D] - coef.X<-coefs[X] - coef.DX<-coefs[paste(D,X,sep=":")] #interaction - - ## # variance - if (is.null(vartype)==TRUE) { - vartype<-"homoscedastic" - } - if (vartype=="homoscedastic") { - v<-vcov(mod.naive) - } else if (vartype=="robust") { - require(sandwich) - v<-vcov(mod.naive,type="HC1") # White with small sample correction - } else if (vartype=="cluster") { - v<-vcovCluster(mod.naive,cluster = data[,cl]) - } else if (vartype=="pcse") { - library(pcse) - v<-pcse(mod.naive,groupN=data[,cl],groupT=data[,time],pairwise=pairwise)$vcov - } - - if (vartype=="pcse") { - var.D<-v[D,D] - var.DX<-v[paste(D,X,sep="."),paste(D,X,sep=".")] - cov.DX<-v[D,paste(D,X,sep=".")] - } else { - var.D<-v[D,D] - var.DX<-v[paste(D,X,sep=":"),paste(D,X,sep=":")] - cov.DX<-v[D,paste(D,X,sep=":")] - } - - ### make a vector of the marginal effect of D on Y as X changes - X.lvls<-as.numeric(quantile(data[,X], probs=seq(0,1,0.01))) - marg<-coef.D + coef.DX*X.lvls - marg - - ## the variance is var(B1_D) + X^2*var(B_3) + 2*inst*cov(D, X) - se<-sqrt(var.D + X.lvls^2*var.DX + 2*X.lvls*cov.DX) - df<-mod.naive$df.residual - crit<-abs(qt(.025, df=df)) # critical values - - ##make 95% confidence bands. - lb<-marg-crit*se - ub<-marg+crit*se - - ################################################## - - ## grouping by X - if (is.null(cutoffs)==TRUE) { - cuts.X<-quantile(data[,X],probs=seq(0,1,1/nbins)) - while (length(unique(cuts.X))!=nbins+1) { - nbins<-nbins-1 - cuts.X<-quantile(data[,X],probs=seq(0,1,1/nbins)) - } - } else { - cutoffs <-cutoffs[which(cutoffs>min(data[,X]) & cutoffs < max(data[,X]))] - cuts.X<- sort(unique(c(min(data[,X]),cutoffs,max(data[,X])))) - - } - groupX<-cut(data[,X],breaks=cuts.X, labels = FALSE) - groupX[which(data[,X]==min(data[,X]))]<-1 - nbins <- length(unique(groupX)) - - ## X labels - groupX2 <- cut(data[,X],breaks=cuts.X) - gp.lab = paste(Xlabel, ": ", levels(groupX2), sep="") - gp.lab[1] <- paste(Xlabel, ": [", substring(levels(groupX2)[1],2), sep = "") - - - ############## Discre#tize X ################# - - ## mid points - x0<-rep(NA,nbins) - for (i in 1:nbins) x0[i]<-median(data[which(groupX==i),X], na.rm=TRUE) - - ## create dummies for bins and interactions - ## G -- a matrix of group dummies - ## DG -- a matrix of interactions - - G<-DG<-GX<-DGX<-matrix(0,n,nbins) - for (i in 1:nbins) { - G[which(groupX==i),i]<-1 - DG[,i]<-data[,D]*G[,i] - GX[,i]<-G[,i]*(data[,X]-x0[i]) - DGX[,i]<-DG[,i]*(data[,X]-x0[i]) - } - - ## formula and esitmation - Gs<-GXs<-DGs<-DGXs<-c() - for (i in 1:nbins) { - Gs<-c(Gs,paste("G[,",i,"]",sep="")) - GXs<-c(GXs,paste("GX[,",i,"]",sep="")) - DGs<-c(DGs,paste("DG[,",i,"]",sep="")) - DGXs<-c(DGXs,paste("DGX[,",i,"]",sep="")) - } - Xf<-paste(Y,"~ -1+",paste(DGs,collapse="+"),"+",paste(DGXs,collapse="+"), - "+",paste(Gs,collapse="+"),"+",paste(GXs,collapse="+"),sep="") - if (is.null(Z)==FALSE) { - Xf<-paste(Xf,"+",paste(Z,collapse="+"),sep="") - } - mod.Xf<-as.formula(Xf) - if (is.null(weights)==TRUE) { - mod.X<-lm(mod.Xf,data=data) - } else { - mod.X<-lm(mod.Xf,data=data,weights=data[,weights]) - } - - ## coefficients and CIs - Xcoefs<-mod.X$coefficients[1:nbins] - if (vartype=="homoscedastic") { - X.v<-vcov(mod.X) - } else if (vartype=="robust") { - X.v<-vcov(mod.X,type="HC1") ## White with small sample correction - } else if (vartype=="cluster") { - X.v<-vcovCluster(mod.X,cluster=data[,cl]) - } else if (vartype=="pcse") { - if (is.null(Z)==FALSE) { - exclude<-names(which(is.na(mod.X$coefficients)==TRUE)) ## drop colinear variables - Z.ex<-setdiff(Z,exclude) - Xf<-paste(Y,"~ -1+",paste(DGs,collapse="+"),"+",paste(DGXs,collapse="+"), - "+",paste(Gs,collapse="+"),"+",paste(GXs,collapse="+"),"+",paste(Z.ex,collapse="+"),sep="") - mod.X<-lm(as.formula(Xf),data=data) - } - X.v<-pcse(mod.X,groupN=data[,cl],groupT=data[,time],pairwise=pairwise)$vcov - } - X.v<-X.v[1:nbins,1:nbins] - X.se<-sqrt(diag(as.matrix(X.v,drop=FALSE))) - X.se[which(is.na(Xcoefs))]<-NA - df.X<-mod.X$df.residual - crit.X<-abs(qt(.025, df=df.X)) - lb.X<-Xcoefs-crit.X*X.se - ub.X<-Xcoefs+crit.X*X.se - est.binning <- cbind(x0, coef=Xcoefs, se = X.se, CI_lower = lb.X, CI_upper = ub.X) - rownames(est.binning) <- gp.lab - - ################### plotting ############################### - - ## margin and label adjustment - - if (figure==TRUE) { - - if(is.null(Xlabel)==FALSE){ - x.label<-c(paste("Moderator: ", Xlabel, sep="")) - y.label<-c(paste("Marginal Effect of ",Dlabel," on ",Ylabel,sep="")) - } else { - x.label<-c(paste("Moderator: ", X, sep="")) - y.label<-c(paste("Marginal Effect of ",D," on ",Y,sep="")) - } - out<-data.frame(X.lvls,marg,lb,ub) - out.bin<-data.frame(x0,Xcoefs,lb.X,ub.X) - out.bin2<-out.bin[which(is.na(Xcoefs)==FALSE),] ## non missing part - out.bin3<-out.bin[which(is.na(Xcoefs)==TRUE),] ## missing part - errorbar.width<-(max(X.lvls)-min(X.lvls))/20 - - p1 <- ggplot() - yrange<-na.omit(c(marg,lb,ub,Xcoefs,lb.X,ub.X)) - if (is.null(ylim)==FALSE) {yrange<-c(ylim[2],ylim[1]+(ylim[2]-ylim[1])*1/6)} - maxdiff<-(max(yrange)-min(yrange)) - pos<-max(yrange)-maxdiff/20 - - ## mark zero - p1 <- p1 + geom_hline(yintercept=0,colour="white",size=2) - - ## mark the original interval - if (is.null(interval)==FALSE) { - p1<- p1 + geom_vline(xintercept=interval,colour="steelblue", - linetype=2,size=1.5) - } - - ## plotting moderator distribution - if (is.null(Xdistr) == TRUE) { - Xdistr <- "density" - } else if (Xdistr != "density" & Xdistr != "histogram" & Xdistr != "hist") { - Xdistr <- "density" - } - if (Xdistr == "density") { # density plot - - if (length(unique(data[,D]))==2) { ## binary D - - ## get densities - if (is.null(weights)==TRUE) { - de.co <- density(data[data[,D]==0,X]) - de.tr <- density(data[data[,D]==1,X]) - } else { - suppressWarnings(library(plotrix)) - suppressWarnings(de.co<-density(data[data[,D]==0,X], - weights=data[data[,D]==0,weights])) - suppressWarnings(de.tr<-density(data[data[,D]==1,X], - weights=data[data[,D]==1,weights])) - } - - ## put in data frames - deX.ymin <- min(yrange)-maxdiff/5 - deX.co <- data.frame( - x = de.co$x, - y = de.co$y/max(de.co$y) * maxdiff/5 + min(yrange) - maxdiff/5 - ) - deX.tr <- data.frame( - x = de.tr$x, - y = de.tr$y/max(de.tr$y) * maxdiff/5 + min(yrange) - maxdiff/5 - ) - - ## color - feed.col<-col2rgb("gray50") - col.co<-rgb(feed.col[1]/1000, feed.col[2]/1000,feed.col[3]/1000) - col.tr<-rgb(red=1, blue=0, green=0) - - ## plotting - p1 <- p1 + geom_ribbon(data = deX.co, - aes(x = x, ymax = y, ymin = deX.ymin), - fill = col.co, alpha = 0.2) + - geom_ribbon(data = deX.tr, - aes(x = x, ymax = y, ymin = deX.ymin), - fill = col.tr, alpha = 0.2) - - - - } else { ## continuous D - - ## get densities - if (is.null(weights)==TRUE) { - de <- density(data[,X]) - } else { - suppressWarnings(library(plotrix)) - suppressWarnings(de <- density(data[,X],weights=data[,weights])) - } - - ## put in data frames - deX.ymin <- min(yrange)-maxdiff/5 - deX <- data.frame( - x = de$x, - y = de$y/max(de$y) * maxdiff/5 + min(yrange) - maxdiff/5 - ) - - ## color - feed.col<-col2rgb("gray50") - col<-rgb(feed.col[1]/1000, feed.col[2]/1000,feed.col[3]/1000) - - ## plotting - p1 <- p1 + geom_ribbon(data = deX, - aes(x = x, ymax = y, ymin = deX.ymin), - fill = col, alpha = 0.2) - - } - - } else { # histogram plot - - if (length(unique(data[,D]))==2) { ## binary D - - if (is.null(weights)==TRUE) { - hist.out<-hist(data[,X],breaks=80,plot=FALSE) - } else { - suppressWarnings(library(plotrix)) - suppressWarnings(hist.out<-hist(data[,X],data[,weights], - breaks=80,plot=FALSE)) - } - n.hist<-length(hist.out$mids) - dist<-hist.out$mids[2]-hist.out$mids[1] - hist.max<-max(hist.out$counts) - ## count the number of treated - count1<-rep(0,n.hist) - treat<-which(data[,D]==max(data[,D])) - for (i in 1:n.hist) { - count1[i]<-sum(data[treat,X]>=hist.out$breaks[i] & - data[treat,X]=hist.out$breaks[n.hist] & - data[treat,X]<=hist.out$breaks[n.hist+1]) - ## put in a data frame - histX<-data.frame(ymin=rep(min(yrange)-maxdiff/5,n.hist), - ymax=hist.out$counts/hist.max*maxdiff/5+min(yrange)-maxdiff/5, - xmin=hist.out$mids-dist/2, - xmax=hist.out$mids+dist/2, - count1=count1/hist.max*maxdiff/5+min(yrange)-maxdiff/5) - p1 <- p1 + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), - colour="gray50",alpha=0,size=0.5) + # control - geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=count1), - fill="red",colour="grey50",alpha=0.3,size=0.5) # treated - - } else { ## continuous D - hist.out<-hist(data[,X],breaks=80,plot=FALSE) - n.hist<-length(hist.out$mids) - dist<-hist.out$mids[2]-hist.out$mids[1] - hist.max<-max(hist.out$counts) - - histX<-data.frame(ymin=rep(min(yrange)-maxdiff/5,n.hist), - ymax=hist.out$counts/hist.max*maxdiff/5+min(yrange)-maxdiff/5, - xmin=hist.out$mids-dist/2, - xmax=hist.out$mids+dist/2) - p1 <- p1 + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), - colour="gray50",alpha=0,size=0.5) - } - } - - - ## title - if (is.null(main)==FALSE) { - p1<-p1 + ggtitle(main) + - theme(plot.title = element_text(hjust = 0.5, size=35, - lineheight=.8, face="bold")) - } - - - - ## labels: L, M, H and so on - if (nbins==3) { - p1<-p1 + annotate(geom="text", x=out.bin[1,1], y=pos, - label="L",colour="gray50",size=10) + - annotate(geom="text", x=out.bin[2,1], y=pos, - label="M",colour="gray50",size=10) + - annotate(geom="text", x=out.bin[3,1], y=pos, - label="H",colour="gray50",size=10) - } else if (nbins==4) { - p1<-p1 + annotate(geom="text", x=out.bin[1,1], y=pos, - label="L",colour="gray50",size=10) + - annotate(geom="text", x=out.bin[2,1], y=pos, - label="M1",colour="gray50",size=10) + - annotate(geom="text", x=out.bin[3,1], y=pos, - label="M2",colour="gray50",size=10) + - annotate(geom="text", x=out.bin[4,1], y=pos, - label="H",colour="gray50",size=10) - } - - ## linear plot - p1<-p1 + geom_line(data=out,aes(X.lvls,marg))+ - geom_ribbon(data=out, aes(x=X.lvls,ymin=lb,ymax=ub),alpha=0.2)+ - xlab(x.label) + ylab(y.label) - ## bin estimates - p1<-p1+ geom_errorbar(data=out.bin2, aes(x=x0, ymin=lb.X, ymax=ub.X),colour="red", - width= errorbar.width)+ - geom_point(data=out.bin2,aes(x0,Xcoefs),size=4,shape=21,fill="white",colour="red") + - theme(axis.title = element_text(size=15)) - ## in case there's non-overlap - p1<-p1+annotate(geom="text", x=out.bin3[,1], y=rep(0,dim(out.bin3)[1]), - label="NaN",colour="red") - - if (is.null(ylim)==FALSE) { - ylim2 = c(ylim[1]-(ylim[2]-ylim[1])*0.25/6, ylim[2]+(ylim[2]-ylim[1])*0.4/6) - } - if (is.null(xlim)==FALSE & is.null(ylim)==FALSE) { - p1<-p1+coord_cartesian(xlim = xlim, ylim = ylim2) - } - if (is.null(xlim)==TRUE & is.null(ylim)==FALSE) { - p1<-p1+coord_cartesian(ylim = ylim2) - } - if (is.null(xlim)==FALSE & is.null(ylim)==TRUE) { - p1<-p1+coord_cartesian(xlim = xlim) - } - - - } # end of plotting - - ## ################# testing ############################### - - ## binary treatment - btreat<-length(unique(data[,D]))==2 - - - ## variance of treatment in each group - varD<-c() - for (i in 1:nbins) { - varD<-c(varD,var(data[groupX==i,D]/mean(data[groupX==i,D]))) - } - - ## L kurtosis - library(Lmoments) - Lkurtosis <- Lmoments(data[,X],returnobject=TRUE)$ratios[4] - - ## if the signs are correct - correctOrder<-ifelse(as.numeric((Xcoefs[1]-Xcoefs[2])*(Xcoefs[2]-Xcoefs[3]))>0,TRUE,FALSE) - - ## p values - pvalue<-function(i,j){ - stat<-(Xcoefs[i]-Xcoefs[j])/sqrt(X.v[i,i]+X.v[j,j]-2*X.v[i,j]) - p<-(1-pt(abs(stat),df.X))*2 - return(p) - } - if (nbins==3) { - p.twosided<-round(c(pvalue(1,2),pvalue(2,3),pvalue(1,3)),digit=4) - names(p.twosided)<-c("p.1v2","p.2v3","p.1v3") - names(Xcoefs)<-c("X_low","X_med","X_high") - } else if (nbins==2) { - p.twosided<-round(pvalue(1,2),digit=4) - names(p.twosided)<-c("p.LvH") - names(Xcoefs)<-c("X_low","X_high") - } else if (nbins==4) { - names(Xcoefs)<-c("X_low","X_med1","X_med2","X_high") - } - - - - ############## Ward Test ##################### - - ## formula - formula0 <- paste(Y,"~",D,"+",X,"+",D,"*",X) - ## create dummies for bins and interactions - ## G -- a matrix of group dummies - ## DG -- a matrix of interactions - G<-DG<-GX<-DGX<-matrix(0,n,(nbins-1)) - for (i in 1:(nbins-1)) { - G[which(groupX==(i+1)),i]<-1 - DG[,i]<-data[,D]*G[,i] - GX[,i]<-data[,X]*G[,i] - DGX[,i]<-data[,D]*data[,X]*G[,i] - } - ## formula and esitmation - Gs<-GXs<-DGs<-DGXs<-c() - for (i in 2:nbins) { - Gs<-c(Gs,paste("G",i,sep="")) - GXs<-c(GXs,paste("GX",i,sep="")) - DGs<-c(DGs,paste("DG",i,sep="")) - DGXs<-c(DGXs,paste("DGX",i,sep="")) - } - colnames(G) <- Gs - colnames(DG) <- DGs - colnames(GX) <- GXs - colnames(DGX) <- DGXs - data.aug <- cbind.data.frame(data, G, DG, GX, DGX) - formula1<-paste(formula0, - "+",paste(Gs,collapse=" + "), - "+",paste(GXs,collapse=" + "), - "+",paste(DGs,collapse=" + "), - "+",paste(DGXs,collapse=" + ")) - if (is.null(Z)==FALSE) { - formula0 <- paste(formula0, "+",paste(Z,collapse=" + ")) - formula1 <- paste(formula1, "+",paste(Z,collapse=" + ")) - } - if (is.null(weights)==TRUE) { - mod.re<-lm(as.formula(formula0),data=data.aug) - mod.un<-lm(as.formula(formula1), data=data.aug) - } else { - mod.re<-lm(as.formula(formula0), data=data.aug, weights=data.aug[,weights]) - mod.un<-lm(as.formula(formula1), data=data.aug, weights=data.aug[,weights]) - } - ## vcov - if (is.null(vartype)==TRUE) vartype <- "homoscedastic" - if (vartype=="homoscedastic") { - v<-vcov(mod.un) - } else if (vartype=="robust") { - v<-vcov(mod.un,type="HC1") # White with small sample correction - } else if (vartype=="cluster") { - v<-vcovCluster(mod.un,cluster = data.aug[,cl]) - } else if (vartype=="pcse") { - v<-pcse(mod.un, - groupN=data.aug[,cl], - groupT=data.aug[,time], - pairwise=pairwise)$vcov - } - if (wald == TRUE) { - library(lmtest) - wald.out <- tryCatch( - p.wald <- round(waldtest(mod.re, mod.un, - test="Chisq", vcov=v)[[4]][2],4) - , error = function(e){return(NULL)} - ) - if (is.null(wald.out)==TRUE) { - p.wald <- NULL - warning("Var-cov matrix nearly sigular in the Wald test.") - } - } - - ################################## - ## storage - out<-list(est.binning = round(est.binning, 4), - binary.treatment=round(btreat,4), - bin.size = round(c(table(groupX)/length(groupX)),4), - treat.variation.byGroup=round(varD,4), - X.Lkurtosis = round(Lkurtosis,4)) - if (nbins==3) { - out<-c(out,list(correctOrder=correctOrder)) - } - if (nbins%in%c(2,3)) { - out<-c(out,list(p.twosided=p.twosided)) - } - if (wald == TRUE) { - out <- c(out, list(p.wald = p.wald)) - } - if (figure==TRUE) {out<-c(out,list(graph=p1))} - return(out) -} - -########################### - -inter.kernel <- function(Y, D, X, - Z = NULL, - weights = NULL, # weighting variable - FE = NULL, - data, - na.rm = FALSE, - CI = TRUE, - conf.level = 0.95, - cl = NULL, - neval = 50, - nboots = 200, - parallel = TRUE, - cores = 4, - seed = 02139, - bw = NULL, # bandwidth - grid = 20, # either a number of a sequence of numbers - metric = "MSPE", - Ylabel = NULL, - Dlabel = NULL, - Xlabel = NULL, - main = NULL, - xlim = NULL, - ylim = NULL, - Xdistr = "histogram", - file = NULL - ){ - - ## check input - if (is.character(Y) == FALSE) { - stop("Y is not a string.") - } else { - Y <- Y[1] - } - if (is.character(D) == FALSE) { - stop("D is not a string.") - } else { - D <- D[1] - } - if (is.character(X) == FALSE) { - stop("X is not a string.") - } else { - X <- X[1] - } - if (is.null(Z) == FALSE) { - for (i in 1:length(Z)) { - if (is.character(Z[i]) == FALSE) { - stop("Some element in Z is not a string.") - } - } - } - if (is.null(FE) == FALSE) { - for (i in 1:length(FE)) { - if (is.character(FE[i]) == FALSE) { - stop("Some element in FE is not a string.") - } - } - } - if (is.null(weights) == FALSE) { - if (is.character(weights) == FALSE) { - stop("weigths is not a string.") - } else { - weights <- weights[1] - } - } - if (is.data.frame(data) == FALSE) { - stop("Not a data frame.") - } - if (is.logical(na.rm) == FALSE & is.numeric(na.rm)==FALSE) { - stop("na.rm is not a logical flag.") - } - if (is.logical(CI) == FALSE & is.numeric(CI)==FALSE) { - stop("CI is not a logical flag.") - } - if (is.null(conf.level)==FALSE) { - if (is.numeric(conf.level)==FALSE) { - stop("conf.level should be a number between 0.5 and 1.") - } else { - if (conf.level<=0.5 | conf.level>1) { - stop("conf.level should be a number between 0.5 and 1.") - } - } - } - if (is.null(cl)==FALSE) { - if (is.character(cl) == FALSE) { - stop("cl is not a string.") - } else { - cl <- cl[1] - } - } - if (is.null(neval)==FALSE) { - if (is.numeric(neval)==FALSE) { - stop("neval is not a positive integer.") - } else { - neval <- neval[1] - if (neval%%1!=0 | neval<=0) { - stop("neval is not a positive integer.") - } - } - } - if (is.null(nboots) == FALSE) { - if (is.numeric(nboots)==FALSE) { - stop("nboots is not a positive integer.") - } else { - nboots <- nboots[1] - if (nboots%%1 != 0 | nboots < 1) { - stop("nboots is not a positive number.") - } - } - } - if (is.logical(parallel) == FALSE & is.numeric(parallel)==FALSE) { - stop("paralell is not a logical flag.") - } - if (is.numeric(cores)==FALSE) { - stop("cores is not a positive integer.") - } else { - cores <- cores[1] - if (cores%%1!= 0 | cores<=0) { - stop("cores is not a positive integer.") - } - } - if (is.null(bw)==FALSE) { - if (is.numeric(bw)==FALSE) { - stop("bw should be a positive number.") - } else { - bw <- bw[1] - } - if (bw<=0) { - stop("bw should be a positive number.") - } - } - if (is.numeric(seed)==FALSE) { - stop("seed should be a number.") - } - if (is.numeric(grid) == FALSE) { - stop("grid should be numeric.") - } else { - if (length(grid)==1) { - if (grid%%1 != 0 | grid<1) { - stop("grid is not a positive integer.") - } - } else { - grid <- grid[which(grid>0)] - } - } - if (!metric%in%c("MSPE","MAPE")) { - stop("metric should be either \"MSPE\" or \"MAPE\".") - } - if (is.null(Ylabel)==TRUE) { - Ylabel <- Y - } else { - if (is.character(Ylabel) == FALSE) { - stop("Ylabel is not a string.") - } else { - Ylabel <- Ylabel[1] - } - } - if (is.null(Dlabel)==TRUE) { - Dlabel <- D - } else { - if (is.character(Dlabel) == FALSE) { - stop("Dlabel is not a string.") - } else { - Dlabel <- Dlabel[1] - } - } - if (is.null(Xlabel)==TRUE) { - Xlabel <- X - } else { - if (is.character(Xlabel) == FALSE) { - stop("Xlabel is not a string.") - } else { - Xlabel <- Xlabel[1] - } - } - if (is.null(main)==FALSE) { - main <- main[1] - } - if (!Xdistr %in% c("hist","histogram","density")){ - stop("Xdistr must be either \"histogram\" or \"density\".") - } - if (is.null(xlim)==FALSE) { - if (is.numeric(xlim)==FALSE) { - stop("Some element in xlim is not numeric.") - } else { - if (length(xlim)!=2) { - stop("xlim must be of length 2.") - } - } - } - if (is.null(ylim)==FALSE) { - if (is.numeric(ylim)==FALSE) { - stop("Some element in ylim is not numeric.") - } else { - if (length(ylim)!=2) { - stop("ylim must be of length 2.") - } - } - } - - - ## set seed - if (is.null(seed)==FALSE) { - set.seed(seed) - } - - ## check variable name - M <- c(Y,D,X,Z,FE,cl,weights) - for (var in M){ - if((var %in% names(data))==FALSE ) - stop("Wrong variable name.") - } - - ## check missing values - if (na.rm == TRUE) { - data <- na.omit(data[,c(Y, D, X, Z, FE)]) - } else { - if (sum(is.na(data[,c(Y, D, X, Z, FE)]))>0) { - stop("Missing values. Try option na.rm = TRUE\n") - } - } - n<-dim(data)[1] - - ## check cluster - if (is.null(cl)==TRUE & is.null(FE)==FALSE) { - warnings("Fixed effects model assumed. Clustering standard errors highly recommended.") - } - - ## change fixed effect variable to factor - if (is.null(FE)==FALSE) { - if (length(FE) == 1) { - data[, FE] <- as.numeric(as.factor(data[, FE])) - } else { - data[, FE] <- sapply(data[,FE],function(vec){as.numeric(as.factor(vec))}) - } - } - - ## labels - if (is.null(Xlabel)==TRUE) {Xlabel = X} - if (is.null(Ylabel)==TRUE) {Ylabel = Y} - if (is.null(Dlabel)==TRUE) {Dlabel = D} - - ## X more than 5 values - if (length(unique(data[ ,X]))< 5) { - warning("Moderator has less than 5 values; consider a fully saturated model.") - } - - - ## preparation for parallel computing - if (parallel == TRUE & (CI == TRUE|is.null(bw))) { - require(foreach) - require(doParallel) - maxcores <- detectCores() - cores <- min(maxcores, cores) - pcl<-makeCluster(cores) - registerDoParallel(pcl) - cat("Parallel computing with", cores,"cores...\n") - } - - ## evaluation points - X.eval <- seq(min(data[,X]), max(data[,X]), length.out = neval) - - ## bandwidth selection - if(is.null(bw) == TRUE){ - CV <- 1 - if (length(grid) == 1) { - rangeX <- max(data[, X]) - min(data[, X]) - grid <- exp(seq(log(rangeX/50), log(rangeX), length.out = grid)) - } - cv.out <- crossvalidate(data = data, X.eval = X.eval, - Y = Y, D = D, X = X, Z = Z, - FE = FE, cl = cl, - weights = weights, - grid = grid, metric = metric, - parallel = parallel) - bw <- cv.out$opt.bw - } else { - CV <- 0 - } - - ## Estimates - if (CI == FALSE) { - est <- coefs(data = data, bw = bw, Y = Y, X = X, D = D, Z = Z, - FE = FE, X.eval = X.eval, weights = weights)[,c(1,3)] - } else { ## bootstrap - ## point estimate - coef <- coefs(data = data, bw = bw, Y = Y, X = X, D = D, Z = Z, - FE = FE, X.eval = X.eval, weights = weights)[,3] - ## boostrap SEs - if (is.null(cl)==FALSE) { ## find clusters - clusters<-unique(data[,cl]) - id.list<-split(1:n,data[,cl]) - } - oneboot <- function() { - if (is.null(cl)==TRUE) { - smp<-sample(1:n,n,replace=TRUE) - } else { ## block bootstrap - cluster.boot<-sample(clusters,length(clusters),replace=TRUE) - smp<-unlist(id.list[match(cluster.boot,clusters)]) - } - s<-data[smp,] - out <- coefs(data = s, bw = bw, Y = Y, X = X, D = D, Z = Z, - FE = FE, X.eval = X.eval, weights = weights)[,3] - return(out) - } - cat("Bootstrapping ...") - if (parallel==TRUE) { - suppressWarnings( - bootout <- foreach (i=1:nboots, .combine=cbind, - .export=c("oneboot","coefs"), - .inorder=FALSE) %dopar% {oneboot()} - ) - cat("\r") - } else { - bootout<-matrix(NA,length(X.eval),nboots) - for (i in 1:nboots) { - bootout[,i] <- oneboot() - if (i%%50==0) cat(i) else cat(".") - } - cat("\r") - } - ## summary - CI.lvl <- c((1-conf.level)/2, (1-(1-conf.level)/2)) - ci<-t(apply(bootout, 1, quantile, CI.lvl)) - est<-data.frame(cbind("X" = X.eval, "ME" = coef, - "SE"=apply(bootout,1,sd), - "CI_lower"=ci[,1], "CI_upper"=ci[,2])) - - } - - if (parallel == TRUE & (CI == TRUE|CV==1)) { - suppressWarnings(stopCluster(pcl)) - cat("\n") - } - - ########################################## - ## plotting - require(ggplot2) - if(is.null(Xlabel)==FALSE){ - x.label<-c(paste("Moderator: ", Xlabel, sep="")) - y.label<-c(paste("Marginal Effect of ",Dlabel," on ",Ylabel,sep="")) - } else { - x.label<-c(paste("Moderator: ", X, sep="")) - y.label<-c(paste("Marginal Effect of ",D," on ",Y,sep="")) - } - - ## mark zero - p1 <- ggplot() + geom_hline(yintercept=0,colour="white",size=2) - - ## point estimates - p1 <- p1 + geom_line(data=est,aes(X,ME)) - - ## confidence intervals - if (CI == TRUE) { - p1 <- p1 + geom_ribbon(data=est, aes(x=X,ymin=CI_lower,ymax=CI_upper),alpha=0.2) - yrange<-na.omit(c(est$CI_lower, est$CI_upper)) - } else { - yrange<-na.omit(c(est$ME)) - } - - ## axis labels - p1 <- p1 + xlab(x.label) + ylab(y.label) + theme(axis.title = element_text(size=15)) - - ## ylim - if (is.null(ylim)==FALSE) { - yrange<-c(ylim[2],ylim[1]+(ylim[2]-ylim[1])*1/6) - } - maxdiff<-(max(yrange)-min(yrange)) - - ## plotting moderator distribution - if (is.null(Xdistr) == TRUE) { - Xdistr <- "density" - } else if (Xdistr != "density" & Xdistr != "histogram" & Xdistr != "hist") { - Xdistr <- "density" - } - if (Xdistr == "density") { # density plot - - if (length(unique(data[,D]))==2) { ## binary D - - ## get densities - de.co <- density(data[data[,D]==0,X]) - de.tr <- density(data[data[,D]==1,X]) - - ## put in data frames - deX.ymin <- min(yrange)-maxdiff/5 - deX.co <- data.frame( - x = de.co$x, - y = de.co$y/max(de.co$y) * maxdiff/5 + min(yrange) - maxdiff/5 - ) - deX.tr <- data.frame( - x = de.tr$x, - y = de.tr$y/max(de.tr$y) * maxdiff/5 + min(yrange) - maxdiff/5 - ) - - ## color - feed.col<-col2rgb("gray50") - col.co<-rgb(feed.col[1]/1000, feed.col[2]/1000,feed.col[3]/1000) - col.tr<-rgb(red=1, blue=0, green=0) - - ## plotting - p1 <- p1 + geom_ribbon(data = deX.co, - aes(x = x, ymax = y, ymin = deX.ymin), - fill = col.co, alpha = 0.2) + - geom_ribbon(data = deX.tr, - aes(x = x, ymax = y, ymin = deX.ymin), - fill = col.tr, alpha = 0.2) - - - } else { ## continuous D - - ## get densities - de <- density(data[,X]) - - ## put in data frames - deX.ymin <- min(yrange)-maxdiff/5 - deX <- data.frame( - x = de$x, - y = de$y/max(de$y) * maxdiff/5 + min(yrange) - maxdiff/5 - ) - - ## color - feed.col<-col2rgb("gray50") - col<-rgb(feed.col[1]/1000, feed.col[2]/1000,feed.col[3]/1000) - - ## plotting - p1 <- p1 + geom_ribbon(data = deX, - aes(x = x, ymax = y, ymin = deX.ymin), - fill = col, alpha = 0.2) - - } - - } else { # histogram plot - - if (length(unique(data[,D]))==2) { ## binary D - - hist.out<-hist(data[,X],breaks=80,plot=FALSE) - n.hist<-length(hist.out$mids) - dist<-hist.out$mids[2]-hist.out$mids[1] - hist.max<-max(hist.out$counts) - ## count the number of treated - count1<-rep(0,n.hist) - treat<-which(data[,D]==max(data[,D])) - for (i in 1:n.hist) { - count1[i]<-sum(data[treat,X]>=hist.out$breaks[i] & - data[treat,X]=hist.out$breaks[n.hist] & - data[treat,X]<=hist.out$breaks[n.hist+1]) - ## put in a data frame - histX<-data.frame(ymin=rep(min(yrange)-maxdiff/5,n.hist), - ymax=hist.out$counts/hist.max*maxdiff/5+min(yrange)-maxdiff/5, - xmin=hist.out$mids-dist/2, - xmax=hist.out$mids+dist/2, - count1=count1/hist.max*maxdiff/5+min(yrange)-maxdiff/5) - p1 <- p1 + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), - colour="gray50",alpha=0,size=0.5) + # control - geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=count1), - fill="red",colour="grey50",alpha=0.3,size=0.5) # treated - - } else { ## continuous D - hist.out<-hist(data[,X],breaks=80,plot=FALSE) - n.hist<-length(hist.out$mids) - dist<-hist.out$mids[2]-hist.out$mids[1] - hist.max<-max(hist.out$counts) - - histX<-data.frame(ymin=rep(min(yrange)-maxdiff/5,n.hist), - ymax=hist.out$counts/hist.max*maxdiff/5+min(yrange)-maxdiff/5, - xmin=hist.out$mids-dist/2, - xmax=hist.out$mids+dist/2) - p1 <- p1 + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), - colour="gray50",alpha=0,size=0.5) - } - } - - ## title - if (is.null(main)==FALSE) { - p1<-p1 + ggtitle(main) + - theme(plot.title = element_text(hjust = 0.5, size=35, - lineheight=.8, face="bold")) - } - - ## xlim and ylim - if (is.null(ylim)==FALSE) { - ylim2 = c(ylim[1]-(ylim[2]-ylim[1])*0.25/6, ylim[2]+(ylim[2]-ylim[1])*0.4/6) - } - if (is.null(xlim)==FALSE & is.null(ylim)==FALSE) { - p1<-p1+coord_cartesian(xlim = xlim, ylim = ylim2) - } - if (is.null(xlim)==TRUE & is.null(ylim)==FALSE) { - p1<-p1+coord_cartesian(ylim = ylim2) - } - if (is.null(xlim)==FALSE & is.null(ylim)==TRUE) { - p1<-p1+coord_cartesian(xlim = xlim) - } - - if (is.null(file)==FALSE) { - pdf(file) - plot(p1) - graphics.off() - } - ###################################################### - - output<-list(bw = bw, - est = est, - graph = p1) - - if (CV == 1) { - output <- c(output, list(CV.out = cv.out$CV.out)) - } - - return (output) -} - -###################################### -## Weighted Least-Squares -###################################### - -coefs <- function(data,bw,Y,X,D, - Z=NULL, - FE=NULL, - weights, - X.eval = NULL, - neval = 50){ - - - ## evaluation points - if (is.null(X.eval) == TRUE) { - X.eval <- seq(min(data$x), max(data$x), length.out = neval) - } - ## conditions - noZ <- is.null(Z) - noFE <- is.null(FE) - - ## survey weights - n <- dim(data)[1] - if (is.null(weights)==TRUE) { - weights <- rep(1, n) - } else { - weights <- data[,weights] - } - - ## storage - result <- matrix(NA, length(X.eval), (5 + length(Z))) - colnames(result)<-c("X","Intercept","ME","x","Dx",Z) - - ## main algorithm - if (noFE) { # no fixed effects, wls - ## formula - if (noZ) { ## no covariates - formula <- as.formula("y ~ d + xx + d_xx") - } else { ## with covariates - formula<- as.formula(paste("y ~ d + xx + d_xx + ",paste(Z,collapse="+"))) - } - ## weighted least squares without fixed effect - wls<-function(x, dat, weights){ - dat1<-dat - dat1$xx<-dat1$x-x - dat1$d_xx <- dat1$d * dat1$xx - dat1$w <- dnorm(dat1$xx/bw) * weights - reg <- lm(formula, data=dat1, weights = w) - result <- c(x, reg$coef) - result[which(is.na(result))] <- 0 - return(result) - } ## end of WLS function - - ## data to be used - dat<-data[, c(Y, D, X, Z)] - colnames(dat)[1:3] <- c("y","d","x") - ## estimation (a loop) - for (i in 1:length(X.eval)) { - result[i,] <- wls(X.eval[i], dat, weights = weights) - } - result <- data.frame(result) - } else{ # with fixed effects - ## data to be used - dat <- data[, c(Y, D, X, Z)] - dat.FE <- as.matrix(data[,FE],drop = FALSE) - for (i in 1:length(X.eval)) { - xx <- dat[,X]-X.eval[i] - dat1 <- as.matrix(cbind(dat[,Y],dat[,D], - xx, dat[,D] * xx, dat[,Z])) - w<-dnorm(xx/bw) * weights - estcoef <- fastplm(data = dat1, FE = dat.FE, - weight = w, FEcoefs = 0)$coefficients - estcoef[which(is.nan(estcoef))] <- 0 - result[i,] <- c(X.eval[i],0,estcoef) - } - result <- data.frame(result) - } - return(result) -} - - -###################################### -## Cross-validation -###################################### - -crossvalidate <- function(data, Y, D, X, Z = NULL, weights = NULL, - FE = NULL, cl = NULL, - X.eval, grid, nfold = 5, parallel = FALSE, - metric = "MSPE"){ - - - ## calculate error for testing set - getError <- function(train, test, bw, Y, X, D, Z, FE, weights, X.eval){ - - coef<-coefs(data=train,bw=bw,Y=Y,X=X,D=D,Z=Z,FE=FE, - weights = weights, X.eval= X.eval) - coef[is.na(coef)] <- 0 - n2<-length(Z) - esCoef<-function(x){ - Xnew<-abs(X.eval-x) - d1<-min(Xnew) ## distance between x[i] and the first nearest x in training set - label1<-which.min(Xnew) - Xnew[label1]<-Inf - d2<-min(Xnew) ## distance between x[i] and the second nearest x in training set - label2<-which.min(Xnew) - if(d1==0){ - func <- coef[label1,-c(1,4,5)] # X.eval (1), intercept (2), d (3), xx (4), d:xx (5), z - } else if(d2==0){ - func <- coef[label2,-c(1,4,5)] - } else{ ## weighted average - func <- (coef[label1,-c(1,4,5)] * d2 + coef[label2,-c(1,4,5)] * d1)/(d1 + d2) - } - return(func) - } - Knn<-t(sapply(test[,X],esCoef)) ## coefficients for test class==matrix - - ## predicting - test.Y <- test[,Y] - test.X <- as.matrix(cbind(rep(1,dim(test)[1]),test[,c(D,Z)])) - sumOfEst<-matrix(lapply(1:length(test.X), function(i){test.X[i]*Knn[[i]]}), - nrow=nrow(test.X), ncol=ncol(test.X)) - error<-test.Y - rowSums(matrix(unlist(sumOfEst),length(test.Y))) - return(c(mean(abs(error)),mean(error^2))) - } - ## grid search and 5 fold cross validation - cv<-function(bw){ - mse<-matrix(NA,nfold,2) - for(j in 1:nfold){ # 5-fold CV - testid <- which(fold==j) - mse[j,] <- getError(train= data[-testid,], test = data[testid, ], - bw = bw, Y=Y, X=X, D=D, Z=Z, FE=FE, weights = weights, X.eval= X.eval) - } - return(c(bw, apply(mse,2,mean))) - } - - - cat("Cross-validating bandwidth ... ") - ## generate 5 random folds - ## if is.null(cluster)==false then fold by cluster - n<-dim(data)[1] - fold<-rep(0,n) - - if(is.null(cl)==TRUE){ - fold <- c(0:(n-1))%%nfold + 1 - fold<-sample(fold, n, replace = FALSE) - } else{ - clusters<-unique(data[,cl]) - m <- length(clusters) - id.list<-split(1:n,data[,cl]) - cl.fold <- c(0:(m-1))%%nfold + 1 - cl.fold <- sample(cl.fold, m, replace = FALSE) - for (i in 1:m) { - id.list[[i]] <- rep(cl.fold[i],length(id.list[[i]])) - } - fold <- unlist(id.list) - } - - ## preprocessing if there's fixed effects - if (is.null(FE)==FALSE) { # with FE, first demean test data - for (var in c(Y,D,Z)) { - data[,var] <- fastplm(data = as.matrix(data[, var]), - FE = as.matrix(data[, FE],drop = FALSE), - weight = rep(1, n), - FEcoefs = 0)$residuals - } - } - ## calculation - if (parallel == TRUE) { - Error<-suppressWarnings(foreach(bw = grid, .combine = rbind, - .export = c("coefs","getError"), - .inorder = FALSE) %dopar% {cv(bw)}) - } else { - Error <- matrix(NA, length(grid), 3) - for (i in 1:length(grid)) { - Error[i, ] <- cv(grid[i]) - cat(".") - } - } - colnames(Error) <- c("bandwidth","MAPE","MSPE") - rownames(Error) <- NULL - - if (metric=="MAPE") { - opt.bw <- grid[which.min(Error[,2])] - } else { - opt.bw <- grid[which.min(Error[,3])] - } - output <- list(CV.out = round(Error,3), - opt.bw = opt.bw) - cat(paste("Bandwidth =", round(output$opt.bw,3),"\n")) - return(output) -} - - -############# - -## vcovCluster.r -## function to compute var-cov matrix using clustered robust standard errors -## inputs: -## model = model object from call to lm or glm -## cluster = vector with cluster ID indicators -## output: -## cluster robust var-cov matrix -## to call this for a model directly use: -## coeftest(model,vcov = vcovCluster(model, cluster)) -## formula is similar to Stata's , cluster command - -vcovCluster <- function( - model, - cluster -) -{ - require(sandwich) - ## require(lmtest) - - if(nrow(model.matrix(model))!=length(cluster)){ - stop("check your data: cluster variable has different N than model") - } - M <- length(unique(cluster)) - N <- length(cluster) - K <- model$rank - ## if(M<50){ - ## warning("Fewer than 50 clusters, variances may be unreliable (could try block bootstrap instead).") - ## } - dfc <- (M/(M - 1)) * ((N - 1)/(N - K)) - uj <- apply(estfun(model), 2, function(x) tapply(x, cluster, sum)); - rcse.cov <- dfc * sandwich(model, meat = crossprod(uj)/N) - ##colnames(rcse.cov)<-rownames(rcse.cov)<-names(model$coefficients) - return(rcse.cov) -} - +## Jens Hainmueller; Jonathan Mummolo; Yiqing Xu +## This Version: 2015.12.05 + +###### Interpreting Interaction Models ####### + +## 1. inter.raw: first look at the data: D, X, Y +## 2. inter.gam: GAM plots +## 3. inter.binning: estimat +## 4. inter.kernel: kernel smooth plot + +## Supporting +## 1. coefs: kernel weighted least squares +## 2. crossvalidation: cross validate kernel bandwidth +## 3. vcovCluster: clustered standard error + + +################################################# +coefs <- NULL +crossvalidate <- NULL +inter.binning <- NULL +inter.kernel <- NULL +inter.raw <- NULL +vcovCluster <- NULL +################################################# + +inter.raw<-function(Y,D,X,weights=NULL,data, + nbins=3,cutoffs=NULL, span=NULL, + Ylabel=NULL,Dlabel= NULL,Xlabel=NULL, + pos=NULL){ + + ## Y: outcome + ## D: "treatment" indicator + ## X: covariate to be interacted with D + ## nbins: ## of bins + ## cutoffs: specified cut-points + ## span: bandwidth for loess + + ## check input + if (is.character(Y) == FALSE) { + stop("Y is not a string.") + } else { + Y <- Y[1] + } + if (is.character(D) == FALSE) { + stop("D is not a string.") + } else { + D <- D[1] + } + if (is.character(X) == FALSE) { + stop("X is not a string.") + } else { + X <- X[1] + } + if (is.null(weights) == FALSE) { + if (is.character(weights) == FALSE) { + stop("weigths is not a string.") + } else { + weights <- weights[1] + } + } + if (is.data.frame(data) == FALSE) { + stop("Not a data frame.") + } + if (is.null(nbins) == FALSE) { + if (nbins%%1 != 0) { + stop("nbins is not a positive integer.") + } else { + nbins <- nbins[1] + } + if (nbins < 1) { + stop("nbins is not a positive integer.") + } + } + if (is.null(cutoffs) == FALSE) { + if (is.numeric(cutoffs) == FALSE) { + stop("Some element of cutoffs is not numeric.") + } + } + if (is.null(span) == FALSE) { + if (is.numeric(span) == FALSE) { + stop("span is not numeric.") + } else { + span <- span[1] + if (span <= 0) { + stop("span is not a positive number.") + } + } + } + if (is.null(Ylabel)==TRUE) { + Ylabel <- Y + } else { + if (is.character(Ylabel) == FALSE) { + stop("Ylabel is not a string.") + } else { + Ylabel <- Ylabel[1] + } + } + if (is.null(Dlabel)==TRUE) { + Dlabel <- D + } else { + if (is.character(Dlabel) == FALSE) { + stop("Dlabel is not a string.") + } else { + Dlabel <- Dlabel[1] + } + } + if (is.null(Xlabel)==TRUE) { + Xlabel <- X + } else { + if (is.character(Xlabel) == FALSE) { + stop("Xlabel is not a string.") + } else { + Xlabel <- Xlabel[1] + } + } + + ## load packages + requireNamespace("ggplot2") + + ## drop missing values + data <- na.omit(data[,c(Y, D, X)]) + + ## ploting + if (length(unique(data[,D]))==2) { ## binary case + + ## plotting + treat.lab<-c(paste(Dlabel,"= 0"),paste(Dlabel,"= 1")) + data.aug<-data + data.aug$treat<-factor(data[,D], labels=treat.lab) + + if (is.null(pos)==TRUE) { + box.pos.co <- min(data[,Y]) + box.pos.tr <- max(data[,Y]) + } else { + box.pos.co <- pos[1] + box.pos.tr <- pos[2] + } + + tr <- which(data[,D]==1) + co <- which(data[,D]==0) + qt90.tr <- quantile(data[tr,X],c(0.05,0.95)) + qt90.co <- quantile(data[co,X],c(0.05,0.95)) + qt50.tr <- quantile(data[tr,X],c(0.25,0.75)) + qt50.co <- quantile(data[co,X],c(0.25,0.75)) + med.tr <- median(data[tr,X]) + med.co <- median(data[co,X]) + + data.aug$qt90 <- NA + data.aug$qt90[which(data.aug[,D]==1 & + data.aug[,X]>=qt90.tr[1] & + data.aug[,X]<=qt90.tr[2])]<- box.pos.tr + data.aug$qt90[which(data.aug[,D]==0 & + data.aug[,X]>=qt90.co[1] & + data.aug[,X]<=qt90.co[2])]<- box.pos.co + data.aug$qt50 <- NA + data.aug$qt50[which(data.aug[,D]==1 & + data.aug[,X]>=qt50.tr[1] & + data.aug[,X]<=qt50.tr[2])]<- box.pos.tr + data.aug$qt50[which(data.aug[,D]==0 & + data.aug[,X]>=qt50.co[1] & + data.aug[,X]<=qt50.co[2])]<- box.pos.co + data.aug$med <- NA + data.aug$med[tr[which.min(abs(data.aug[tr,X]-med.tr))]]<- box.pos.tr + data.aug$med[co[which.min(abs(data.aug[co,X]-med.co))]]<- box.pos.co + + + ## plotting + if (is.null(weights)==TRUE) { + p1 <- ggplot(data.aug, aes_string(X, Y)) + } else { + p1 <- ggplot(data.aug, aes_string(X, Y, weight=weights)) + + } + p1 <- p1 + geom_point() + + geom_smooth(method = "lm", se = F, fullrange = T, + colour = "steelblue", size = 1) + + if (is.null(span)==TRUE) { + p1 <- p1+ geom_smooth(method = "loess", formula = y ~ x, + se = F, colour="red") + } else { + p1 <- p1 + geom_smooth(method = "loess", formula = y ~ x, + se = F, colour="red",span=span) + } + p1 <- p1 + xlab(Xlabel) + ylab(Ylabel) + + + p1 <- p1 + geom_line(aes_string(X,"qt90"), + size=1,colour="grey50", na.rm = TRUE) + p1 <- p1 + geom_line(aes_string(X,"qt50"), + size=3,colour="grey50", na.rm = TRUE) + p1 <- p1 + geom_point(aes_string(X,"med"),size=3, + shape=21,fill="white",colour="red", na.rm = TRUE) + + p1 <- p1 + theme(axis.title = element_text(size=15)) + + p1 <- p1 + facet_wrap(~treat, ncol=1) + #p1 <- p1 + facet_grid(treat ~.) + + + + } else { # continuous case + + ## grouping by X + if (is.null(cutoffs)==TRUE) { + cutoff<-quantile(data[,X],probs=seq(0,1,1/nbins)) + while (length(unique(cutoff))!=nbins+1) { + nbins<-nbins-1 + cutoff<-quantile(data[,X],probs=seq(0,1,1/nbins)) + } + } else { + cutoffs <-cutoffs[which(cutoffs>min(data[,X]) & cutoffs < max(data[,X]))] + cutoff<- sort(unique(c(min(data[,X]),cutoffs,max(data[,X])))) + } + groupID<-cut(data[,X],breaks=cutoff, labels = FALSE) + groupID[which(data[,X]==min(data[,X]))]<-1 + + ## X labels + groupID2 <- cut(data[,X],breaks=cutoff) + gp.lab = paste(Xlabel, ": ", levels(groupID2), sep="") + gp.lab[1] <- paste(Xlabel, ": [", substring(levels(groupID2)[1],2), sep = "") + nbins <- length(unique(groupID)) + + ## if (nbins==2) { + ## gp.lab<-c(paste(Xlabel,": low",sep=""),paste(Xlabel,": high",sep="")) + ## } else if (nbins==3) { + ## gp.lab<-c(paste(Xlabel,": low",sep=""),paste(Xlabel,": medium",sep=""), + ## paste(Xlabel,": high",sep="")) + ## } else { + ## gp.lab<-c(); + ## for (i in 1:nbins) { + ## gp.lab<-c(gp.lab,paste("Grp",i)) + ## } + ## } + + groupID <- factor(groupID, labels=gp.lab) + data.aug <- data + data.aug$groupID<-groupID + + ## plotting + if (is.null(weights)==TRUE) { + p1 <- ggplot(data.aug, aes_string(D, Y)) + } else { + p1 <- ggplot(data.aug, aes_string(D, Y,weight=weights)) + } + p1 <- p1 + geom_point() + + geom_smooth(method = "lm", se = F, fullrange = T, + colour = "steelblue", size = 1) + if (is.null(span)==TRUE) { + p1 <- p1 + + geom_smooth(method = "loess", formula = y ~ x, + se = F, colour="red") + } else { + p1 <- p1 + + geom_smooth(method = "loess", formula = y ~ x, + se = F, colour="red",span=span) + } + + p1 <- p1 + xlab(Dlabel) + ylab(Ylabel) + facet_grid(.~groupID) + + } + return(list(graph = p1)) +} + + +############################################################# +## GAM + + +inter.gam<-function(Y,D,X,Z=NULL,weights=NULL,FE=NULL,data, + SE = FALSE, + k=10, + angle=c(30,100,-30,-120), + Ylabel = NULL, Dlabel = NULL, Xlabel = NULL){ + + ## check input + if (is.character(Y) == FALSE) { + stop("Y is not a string.") + } else { + Y <- Y[1] + } + if (is.character(D) == FALSE) { + stop("D is not a string.") + } else { + D <- D[1] + } + if (is.character(X) == FALSE) { + stop("X is not a string.") + } else { + X <- X[1] + } + if (is.null(Z) == FALSE) { + for (i in 1:length(Z)) { + if (is.character(Z[i]) == FALSE) { + stop("Some element in Z is not a string.") + } + } + } + if (is.null(FE) == FALSE) { + for (i in 1:length(FE)) { + if (is.character(FE[i]) == FALSE) { + stop("Some element in FE is not a string.") + } + } + } + if (is.null(weights) == FALSE) { + if (is.character(weights) == FALSE) { + stop("weigths is not a string.") + } else { + weights <- weights[1] + } + } + if (is.data.frame(data) == FALSE) { + stop("Not a data frame.") + } + if (is.logical(SE) == FALSE & is.numeric(SE)==FALSE) { + stop("SE is not a logical flag.") + } + if (is.null(k) == FALSE) { + if (is.numeric(k) == FALSE) { + stop("k is not numeric.") + } else { + k <- k[1] + if (k <= 0) { + stop("k is not a positive number.") + } + } + } + if (length(angle)>=5 | length(angle)<1) { + stop("angle must have length 1 to 4.") + } else { + if (is.numeric(angle)==FALSE) { + stop("Some element in angle is not numeric.") + } + } + if (is.null(Ylabel)==TRUE) { + Ylabel <- Y + } else { + if (is.character(Ylabel) == FALSE) { + stop("Ylabel is not a string.") + } else { + Ylabel <- Ylabel[1] + } + } + if (is.null(Dlabel)==TRUE) { + Dlabel <- D + } else { + if (is.character(Dlabel) == FALSE) { + stop("Dlabel is not a string.") + } else { + Dlabel <- Dlabel[1] + } + } + if (is.null(Xlabel)==TRUE) { + Xlabel <- X + } else { + if (is.character(Xlabel) == FALSE) { + stop("Xlabel is not a string.") + } else { + Xlabel <- Xlabel[1] + } + } + if (length(unique(data[,D]))==2) { + warning("The treatment variable is dichotomous.") + } + + ## drop missing values + data <- na.omit(data[,c(Y, D, X, Z)]) + + requireNamespace("mgcv") + if (is.null(FE)==FALSE) { + if (is.null(Z)==TRUE) {Z<-c()} + for (i in 1:length(FE)) { + Z<-c(Z,paste("as.factor(",FE[i],")",sep="")) + } + } + + if (is.null(Z)==TRUE) { # no controls + formula<-as.formula(paste(Y,"~","s(",D,",",X,",k=",k,")")) + } else { + formula<-as.formula(paste(Y,"~","s(",D,",",X,",k=",k,")+", + paste(Z,collapse="+"))) + } + if (is.null(weights)==TRUE) { + model<-gam(formula, data=data) + } else { + model<-gam(formula, data=data,weights=data[,weights]) + } + par(mfrow=c(2,2),mar=c(2,2,0,0)) + if (SE==0) { + for (i in angle) { + vis.gam(model, view=c(D,X), type="response",cex.lab=1.5, + xlab = Dlabel, ylab = Xlabel, zlab=Ylabel, + ticktype="detailed",plot.type="persp",n.grid=40,too.far=0.5,theta=i,phi=20) + } + } else { + for (i in angle) { + vis.gam(model, view=c(D, X), type="response",cex.lab=1.5, + xlab = Dlabel, ylab = Xlabel, zlab=Ylabel, + ticktype="detailed",plot.type="persp",se=2,n.grid=40,too.far=0.5,theta=30,phi=20) + } + } +} + + +#################################################### + +inter.binning<-function( + Y, # outcome + D, # treatment indicator + X, # moderator + Z = NULL, # covariates + FE = NULL, # fixed effects + weights = NULL, # weigthing variable + data, + na.rm = FALSE, + nbins = 3, # No. of X bins + cutoffs = NULL, + vartype = "homoscedastic", # variance type + ## "homoscedastic" (default); "robust"; "cluster", "pcse" + cl = NULL, # variable to be clustered on + time = NULL, # time variable for pcse + pairwise = TRUE, # pcse option + figure = TRUE, + main = NULL, + Ylabel = NULL, + Dlabel = NULL, + Xlabel = NULL, + xlim = NULL, + ylim = NULL, + interval = NULL, + Xdistr = "histogram", # c("density","histogram") + wald = TRUE + ){ + + x <- NULL + y <- NULL + xmin <- NULL + xmax <- NULL + ymin <- NULL + ymax <- NULL + ## check input + if (is.character(Y) == FALSE) { + stop("Y is not a string.") + } else { + Y <- Y[1] + } + if (is.character(D) == FALSE) { + stop("D is not a string.") + } else { + D <- D[1] + } + if (is.character(X) == FALSE) { + stop("X is not a string.") + } else { + X <- X[1] + } + if (is.null(Z) == FALSE) { + for (i in 1:length(Z)) { + if (is.character(Z[i]) == FALSE) { + stop("Some element in Z is not a string.") + } + } + } + if (is.null(FE) == FALSE) { + for (i in 1:length(FE)) { + if (is.character(FE[i]) == FALSE) { + stop("Some element in FE is not a string.") + } + } + } + if (is.null(weights) == FALSE) { + if (is.character(weights) == FALSE) { + stop("weigths is not a string.") + } else { + weights <- weights[1] + } + } + if (is.data.frame(data) == FALSE) { + stop("Not a data frame.") + } + if (is.logical(na.rm) == FALSE & is.numeric(na.rm)==FALSE) { + stop("na.rm is not a logical flag.") + } + if (is.null(nbins) == FALSE) { + if (nbins%%1 != 0) { + stop("nbins is not a positive integer.") + } else { + nbins <- nbins[1] + } + if (nbins < 1) { + stop("nbins is not a positive integer.") + } + } + if (is.null(cutoffs) == FALSE) { + if (is.numeric(cutoffs) == FALSE) { + stop("Some element of cutoffs is not numeric.") + } + } + if (is.null(vartype)==TRUE) { + vartype <- "homoscedastic" + } + if (!vartype %in% c("homoscedastic","robust","cluster","pcse")){ + stop("vartype must be one of the following: \"homoscedastic\",\"robust\",\"cluster\",\"pcse\".") + } else if (vartype == "cluster") { + if (is.null(cl)==TRUE) { + stop("cl not specified; set cl = \"varname\".") + } + } else if (vartype == "pcse") { + if (is.null(cl)==TRUE | is.null(time)==TRUE) { + stop("cl or time not specified; set cl = \"varname1\", time = \"varname2\":.") + } + } + if (is.null(cl)==FALSE) { + if (is.character(cl) == FALSE) { + stop("cl is not a string.") + } else { + cl <- cl[1] + if (vartype != "pcse") { + vartype <- "cluster" + } + } + } + if (is.null(time)==FALSE) { + if (is.character(time) == FALSE) { + stop("time is not a string.") + } else { + time <- time[1] + } + } + if (is.logical(pairwise) == FALSE & is.numeric(pairwise)==FALSE) { + stop("pairwise is not a logical flag.") + } + if (is.null(Ylabel)==TRUE) { + Ylabel <- Y + } else { + if (is.character(Ylabel) == FALSE) { + stop("Ylabel is not a string.") + } else { + Ylabel <- Ylabel[1] + } + } + if (is.null(Dlabel)==TRUE) { + Dlabel <- D + } else { + if (is.character(Dlabel) == FALSE) { + stop("Dlabel is not a string.") + } else { + Dlabel <- Dlabel[1] + } + } + if (is.null(Xlabel)==TRUE) { + Xlabel <- X + } else { + if (is.character(Xlabel) == FALSE) { + stop("Xlabel is not a string.") + } else { + Xlabel <- Xlabel[1] + } + } + if (is.null(main)==FALSE) { + main <- main[1] + } + if (!Xdistr %in% c("hist","histogram","density")){ + stop("Xdistr must be either \"histogram\" or \"density\".") + } + if (is.null(xlim)==FALSE) { + if (is.numeric(xlim)==FALSE) { + stop("Some element in xlim is not numeric.") + } else { + if (length(xlim)!=2) { + stop("xlim must be of length 2.") + } + } + } + if (is.null(ylim)==FALSE) { + if (is.numeric(ylim)==FALSE) { + stop("Some element in ylim is not numeric.") + } else { + if (length(ylim)!=2) { + stop("ylim must be of length 2.") + } + } + } + + ## check missing values + if (na.rm == TRUE) { + data <- na.omit(data[,c(Y, D, X, Z, FE)]) + } else { + if (sum(is.na(data[,c(Y, D, X, Z, FE)]))>0) { + stop("Missing values. Try option na.rm = TRUE\n") + } + } + + ################################# + + requireNamespace("ggplot2") + n<-dim(data)[1] + data[,D]<-as.numeric(data[,D]) + + ## parsing fixed effects + if (is.null(FE)==FALSE) { + if (is.null(Z)==TRUE) {Z<-c()} + for (i in 1:length(FE)) { + Z<-c(Z,paste("as.factor(",FE[i],")",sep="")) + } + } + + ## a naive fit + if (is.null(Z)==FALSE) { + mod.f<-as.formula(paste(Y,"~",D,"+",X,"+",D,"*",X,"+",paste(Z,collapse="+"),sep="")) + } else { + mod.f<-as.formula(paste(Y,"~",D,"+",X,"+",D,"*",X,sep="")) + } + if (is.null(weights)==TRUE) { + mod.naive<-lm(mod.f,data=data) + } else { + mod.naive<-lm(mod.f,data=data,weights=data[,weights]) + } + ## coefficients + coefs<-summary(mod.naive)$coefficients[,1] + coef.D<-coefs[D] + coef.X<-coefs[X] + coef.DX<-coefs[paste(D,X,sep=":")] #interaction + + ## # variance + if (is.null(vartype)==TRUE) { + vartype<-"homoscedastic" + } + if (vartype=="homoscedastic") { + v<-vcov(mod.naive) + } else if (vartype=="robust") { + requireNamespace("sandwich") + v<-vcov(mod.naive,type="HC1") # White with small sample correction + } else if (vartype=="cluster") { + v<-vcovCluster(mod.naive,cluster = data[,cl]) + } else if (vartype=="pcse") { + requireNamespace("pcse") + v<-pcse(mod.naive,groupN=data[,cl],groupT=data[,time],pairwise=pairwise)$vcov + } + + if (vartype=="pcse") { + var.D<-v[D,D] + var.DX<-v[paste(D,X,sep="."),paste(D,X,sep=".")] + cov.DX<-v[D,paste(D,X,sep=".")] + } else { + var.D<-v[D,D] + var.DX<-v[paste(D,X,sep=":"),paste(D,X,sep=":")] + cov.DX<-v[D,paste(D,X,sep=":")] + } + + ### make a vector of the marginal effect of D on Y as X changes + X.lvls<-as.numeric(quantile(data[,X], probs=seq(0,1,0.01))) + marg<-coef.D + coef.DX*X.lvls + marg + + ## the variance is var(B1_D) + X^2*var(B_3) + 2*inst*cov(D, X) + se<-sqrt(var.D + X.lvls^2*var.DX + 2*X.lvls*cov.DX) + df<-mod.naive$df.residual + crit<-abs(qt(.025, df=df)) # critical values + + ##make 95% confidence bands. + lb<-marg-crit*se + ub<-marg+crit*se + + ################################################## + + ## grouping by X + if (is.null(cutoffs)==TRUE) { + cuts.X<-quantile(data[,X],probs=seq(0,1,1/nbins)) + while (length(unique(cuts.X))!=nbins+1) { + nbins<-nbins-1 + cuts.X<-quantile(data[,X],probs=seq(0,1,1/nbins)) + } + } else { + cutoffs <-cutoffs[which(cutoffs>min(data[,X]) & cutoffs < max(data[,X]))] + cuts.X<- sort(unique(c(min(data[,X]),cutoffs,max(data[,X])))) + + } + groupX<-cut(data[,X],breaks=cuts.X, labels = FALSE) + groupX[which(data[,X]==min(data[,X]))]<-1 + nbins <- length(unique(groupX)) + + ## X labels + groupX2 <- cut(data[,X],breaks=cuts.X) + gp.lab = paste(Xlabel, ": ", levels(groupX2), sep="") + gp.lab[1] <- paste(Xlabel, ": [", substring(levels(groupX2)[1],2), sep = "") + + + ############## Discre#tize X ################# + + ## mid points + x0<-rep(NA,nbins) + for (i in 1:nbins) x0[i]<-median(data[which(groupX==i),X], na.rm=TRUE) + + ## create dummies for bins and interactions + ## G -- a matrix of group dummies + ## DG -- a matrix of interactions + + G<-DG<-GX<-DGX<-matrix(0,n,nbins) + for (i in 1:nbins) { + G[which(groupX==i),i]<-1 + DG[,i]<-data[,D]*G[,i] + GX[,i]<-G[,i]*(data[,X]-x0[i]) + DGX[,i]<-DG[,i]*(data[,X]-x0[i]) + } + + ## formula and esitmation + Gs<-GXs<-DGs<-DGXs<-c() + for (i in 1:nbins) { + Gs<-c(Gs,paste("G[,",i,"]",sep="")) + GXs<-c(GXs,paste("GX[,",i,"]",sep="")) + DGs<-c(DGs,paste("DG[,",i,"]",sep="")) + DGXs<-c(DGXs,paste("DGX[,",i,"]",sep="")) + } + Xf<-paste(Y,"~ -1+",paste(DGs,collapse="+"),"+",paste(DGXs,collapse="+"), + "+",paste(Gs,collapse="+"),"+",paste(GXs,collapse="+"),sep="") + if (is.null(Z)==FALSE) { + Xf<-paste(Xf,"+",paste(Z,collapse="+"),sep="") + } + mod.Xf<-as.formula(Xf) + if (is.null(weights)==TRUE) { + mod.X<-lm(mod.Xf,data=data) + } else { + mod.X<-lm(mod.Xf,data=data,weights=data[,weights]) + } + + ## coefficients and CIs + Xcoefs<-mod.X$coefficients[1:nbins] + if (vartype=="homoscedastic") { + X.v<-vcov(mod.X) + } else if (vartype=="robust") { + X.v<-vcov(mod.X,type="HC1") ## White with small sample correction + } else if (vartype=="cluster") { + X.v<-vcovCluster(mod.X,cluster=data[,cl]) + } else if (vartype=="pcse") { + if (is.null(Z)==FALSE) { + exclude<-names(which(is.na(mod.X$coefficients)==TRUE)) ## drop colinear variables + Z.ex<-setdiff(Z,exclude) + Xf<-paste(Y,"~ -1+",paste(DGs,collapse="+"),"+",paste(DGXs,collapse="+"), + "+",paste(Gs,collapse="+"),"+",paste(GXs,collapse="+"),"+",paste(Z.ex,collapse="+"),sep="") + mod.X<-lm(as.formula(Xf),data=data) + } + X.v<-pcse(mod.X,groupN=data[,cl],groupT=data[,time],pairwise=pairwise)$vcov + } + X.v<-X.v[1:nbins,1:nbins] + X.se<-sqrt(diag(as.matrix(X.v,drop=FALSE))) + X.se[which(is.na(Xcoefs))]<-NA + df.X<-mod.X$df.residual + crit.X<-abs(qt(.025, df=df.X)) + lb.X<-Xcoefs-crit.X*X.se + ub.X<-Xcoefs+crit.X*X.se + est.binning <- cbind(x0, coef=Xcoefs, se = X.se, CI_lower = lb.X, CI_upper = ub.X) + rownames(est.binning) <- gp.lab + + ################### plotting ############################### + + ## margin and label adjustment + + if (figure==TRUE) { + + if(is.null(Xlabel)==FALSE){ + x.label<-c(paste("Moderator: ", Xlabel, sep="")) + y.label<-c(paste("Marginal Effect of ",Dlabel," on ",Ylabel,sep="")) + } else { + x.label<-c(paste("Moderator: ", X, sep="")) + y.label<-c(paste("Marginal Effect of ",D," on ",Y,sep="")) + } + out<-data.frame(X.lvls,marg,lb,ub) + out.bin<-data.frame(x0,Xcoefs,lb.X,ub.X) + out.bin2<-out.bin[which(is.na(Xcoefs)==FALSE),] ## non missing part + out.bin3<-out.bin[which(is.na(Xcoefs)==TRUE),] ## missing part + errorbar.width<-(max(X.lvls)-min(X.lvls))/20 + + p1 <- ggplot() + yrange<-na.omit(c(marg,lb,ub,Xcoefs,lb.X,ub.X)) + if (is.null(ylim)==FALSE) {yrange<-c(ylim[2],ylim[1]+(ylim[2]-ylim[1])*1/6)} + maxdiff<-(max(yrange)-min(yrange)) + pos<-max(yrange)-maxdiff/20 + + ## mark zero + p1 <- p1 + geom_hline(yintercept=0,colour="white",size=2) + + ## mark the original interval + if (is.null(interval)==FALSE) { + p1<- p1 + geom_vline(xintercept=interval,colour="steelblue", + linetype=2,size=1.5) + } + + ## plotting moderator distribution + if (is.null(Xdistr) == TRUE) { + Xdistr <- "density" + } else if (Xdistr != "density" & Xdistr != "histogram" & Xdistr != "hist") { + Xdistr <- "density" + } + if (Xdistr == "density") { # density plot + + if (length(unique(data[,D]))==2) { ## binary D + + ## get densities + if (is.null(weights)==TRUE) { + de.co <- density(data[data[,D]==0,X]) + de.tr <- density(data[data[,D]==1,X]) + } else { + suppressWarnings(de.co<-density(data[data[,D]==0,X], + weights=data[data[,D]==0,weights])) + suppressWarnings(de.tr<-density(data[data[,D]==1,X], + weights=data[data[,D]==1,weights])) + } + + ## put in data frames + deX.ymin <- min(yrange)-maxdiff/5 + deX.co <- data.frame( + x = de.co$x, + y = de.co$y/max(de.co$y) * maxdiff/5 + min(yrange) - maxdiff/5 + ) + deX.tr <- data.frame( + x = de.tr$x, + y = de.tr$y/max(de.tr$y) * maxdiff/5 + min(yrange) - maxdiff/5 + ) + + ## color + feed.col<-col2rgb("gray50") + col.co<-rgb(feed.col[1]/1000, feed.col[2]/1000,feed.col[3]/1000) + col.tr<-rgb(red=1, blue=0, green=0) + + ## plotting + p1 <- p1 + geom_ribbon(data = deX.co, + aes(x = x, ymax = y, ymin = deX.ymin), + fill = col.co, alpha = 0.2) + + geom_ribbon(data = deX.tr, + aes(x = x, ymax = y, ymin = deX.ymin), + fill = col.tr, alpha = 0.2) + + + + } else { ## continuous D + + ## get densities + if (is.null(weights)==TRUE) { + de <- density(data[,X]) + } else { + suppressWarnings(de <- density(data[,X],weights=data[,weights])) + } + + ## put in data frames + deX.ymin <- min(yrange)-maxdiff/5 + deX <- data.frame( + x = de$x, + y = de$y/max(de$y) * maxdiff/5 + min(yrange) - maxdiff/5 + ) + + ## color + feed.col<-col2rgb("gray50") + col<-rgb(feed.col[1]/1000, feed.col[2]/1000,feed.col[3]/1000) + + ## plotting + p1 <- p1 + geom_ribbon(data = deX, + aes(x = x, ymax = y, ymin = deX.ymin), + fill = col, alpha = 0.2) + + } + + } else { # histogram plot + + if (length(unique(data[,D]))==2) { ## binary D + + if (is.null(weights)==TRUE) { + hist.out<-hist(data[,X],breaks=80,plot=FALSE) + } else { + suppressWarnings(hist.out<-hist(data[,X],data[,weights], + breaks=80,plot=FALSE)) + } + n.hist<-length(hist.out$mids) + dist<-hist.out$mids[2]-hist.out$mids[1] + hist.max<-max(hist.out$counts) + ## count the number of treated + count1<-rep(0,n.hist) + treat<-which(data[,D]==max(data[,D])) + for (i in 1:n.hist) { + count1[i]<-sum(data[treat,X]>=hist.out$breaks[i] & + data[treat,X]=hist.out$breaks[n.hist] & + data[treat,X]<=hist.out$breaks[n.hist+1]) + ## put in a data frame + histX<-data.frame(ymin=rep(min(yrange)-maxdiff/5,n.hist), + ymax=hist.out$counts/hist.max*maxdiff/5+min(yrange)-maxdiff/5, + xmin=hist.out$mids-dist/2, + xmax=hist.out$mids+dist/2, + count1=count1/hist.max*maxdiff/5+min(yrange)-maxdiff/5) + p1 <- p1 + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), + colour="gray50",alpha=0,size=0.5) + # control + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=count1), + fill="red",colour="grey50",alpha=0.3,size=0.5) # treated + + } else { ## continuous D + hist.out<-hist(data[,X],breaks=80,plot=FALSE) + n.hist<-length(hist.out$mids) + dist<-hist.out$mids[2]-hist.out$mids[1] + hist.max<-max(hist.out$counts) + + histX<-data.frame(ymin=rep(min(yrange)-maxdiff/5,n.hist), + ymax=hist.out$counts/hist.max*maxdiff/5+min(yrange)-maxdiff/5, + xmin=hist.out$mids-dist/2, + xmax=hist.out$mids+dist/2) + p1 <- p1 + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), + colour="gray50",alpha=0,size=0.5) + } + } + + + ## title + if (is.null(main)==FALSE) { + p1<-p1 + ggtitle(main) + + theme(plot.title = element_text(hjust = 0.5, size=35, + lineheight=.8, face="bold")) + } + + + + ## labels: L, M, H and so on + if (nbins==3) { + p1<-p1 + annotate(geom="text", x=out.bin[1,1], y=pos, + label="L",colour="gray50",size=10) + + annotate(geom="text", x=out.bin[2,1], y=pos, + label="M",colour="gray50",size=10) + + annotate(geom="text", x=out.bin[3,1], y=pos, + label="H",colour="gray50",size=10) + } else if (nbins==4) { + p1<-p1 + annotate(geom="text", x=out.bin[1,1], y=pos, + label="L",colour="gray50",size=10) + + annotate(geom="text", x=out.bin[2,1], y=pos, + label="M1",colour="gray50",size=10) + + annotate(geom="text", x=out.bin[3,1], y=pos, + label="M2",colour="gray50",size=10) + + annotate(geom="text", x=out.bin[4,1], y=pos, + label="H",colour="gray50",size=10) + } + + ## linear plot + p1<-p1 + geom_line(data=out,aes(X.lvls,marg))+ + geom_ribbon(data=out, aes(x=X.lvls,ymin=lb,ymax=ub),alpha=0.2)+ + xlab(x.label) + ylab(y.label) + ## bin estimates + p1<-p1+ geom_errorbar(data=out.bin2, aes(x=x0, ymin=lb.X, ymax=ub.X),colour="red", + width= errorbar.width)+ + geom_point(data=out.bin2,aes(x0,Xcoefs),size=4,shape=21,fill="white",colour="red") + + theme(axis.title = element_text(size=15)) + ## in case there's non-overlap + p1<-p1+annotate(geom="text", x=out.bin3[,1], y=rep(0,dim(out.bin3)[1]), + label="NaN",colour="red") + + if (is.null(ylim)==FALSE) { + ylim2 = c(ylim[1]-(ylim[2]-ylim[1])*0.25/6, ylim[2]+(ylim[2]-ylim[1])*0.4/6) + } + if (is.null(xlim)==FALSE & is.null(ylim)==FALSE) { + p1<-p1+coord_cartesian(xlim = xlim, ylim = ylim2) + } + if (is.null(xlim)==TRUE & is.null(ylim)==FALSE) { + p1<-p1+coord_cartesian(ylim = ylim2) + } + if (is.null(xlim)==FALSE & is.null(ylim)==TRUE) { + p1<-p1+coord_cartesian(xlim = xlim) + } + + + } # end of plotting + + ## ################# testing ############################### + + ## binary treatment + btreat<-length(unique(data[,D]))==2 + + + ## variance of treatment in each group + varD<-c() + for (i in 1:nbins) { + varD<-c(varD,var(data[groupX==i,D]/mean(data[groupX==i,D]))) + } + + ## L kurtosis + requireNamespace("Lmoments") + Lkurtosis <- Lmoments(data[,X],returnobject=TRUE)$ratios[4] + + ## if the signs are correct + correctOrder<-ifelse(as.numeric((Xcoefs[1]-Xcoefs[2])*(Xcoefs[2]-Xcoefs[3]))>0,TRUE,FALSE) + + ## p values + pvalue<-function(i,j){ + stat<-(Xcoefs[i]-Xcoefs[j])/sqrt(X.v[i,i]+X.v[j,j]-2*X.v[i,j]) + p<-(1-pt(abs(stat),df.X))*2 + return(p) + } + if (nbins==3) { + p.twosided<-round(c(pvalue(1,2),pvalue(2,3),pvalue(1,3)),digits=4) + names(p.twosided)<-c("p.1v2","p.2v3","p.1v3") + names(Xcoefs)<-c("X_low","X_med","X_high") + } else if (nbins==2) { + p.twosided<-round(pvalue(1,2),digits=4) + names(p.twosided)<-c("p.LvH") + names(Xcoefs)<-c("X_low","X_high") + } else if (nbins==4) { + names(Xcoefs)<-c("X_low","X_med1","X_med2","X_high") + } + + + + ############## Ward Test ##################### + + ## formula + formula0 <- paste(Y,"~",D,"+",X,"+",D,"*",X) + ## create dummies for bins and interactions + ## G -- a matrix of group dummies + ## DG -- a matrix of interactions + G<-DG<-GX<-DGX<-matrix(0,n,(nbins-1)) + for (i in 1:(nbins-1)) { + G[which(groupX==(i+1)),i]<-1 + DG[,i]<-data[,D]*G[,i] + GX[,i]<-data[,X]*G[,i] + DGX[,i]<-data[,D]*data[,X]*G[,i] + } + ## formula and esitmation + Gs<-GXs<-DGs<-DGXs<-c() + for (i in 2:nbins) { + Gs<-c(Gs,paste("G",i,sep="")) + GXs<-c(GXs,paste("GX",i,sep="")) + DGs<-c(DGs,paste("DG",i,sep="")) + DGXs<-c(DGXs,paste("DGX",i,sep="")) + } + colnames(G) <- Gs + colnames(DG) <- DGs + colnames(GX) <- GXs + colnames(DGX) <- DGXs + data.aug <- cbind.data.frame(data, G, DG, GX, DGX) + formula1<-paste(formula0, + "+",paste(Gs,collapse=" + "), + "+",paste(GXs,collapse=" + "), + "+",paste(DGs,collapse=" + "), + "+",paste(DGXs,collapse=" + ")) + if (is.null(Z)==FALSE) { + formula0 <- paste(formula0, "+",paste(Z,collapse=" + ")) + formula1 <- paste(formula1, "+",paste(Z,collapse=" + ")) + } + if (is.null(weights)==TRUE) { + mod.re<-lm(as.formula(formula0),data=data.aug) + mod.un<-lm(as.formula(formula1), data=data.aug) + } else { + mod.re<-lm(as.formula(formula0), data=data.aug, weights=data.aug[,weights]) + mod.un<-lm(as.formula(formula1), data=data.aug, weights=data.aug[,weights]) + } + ## vcov + if (is.null(vartype)==TRUE) vartype <- "homoscedastic" + if (vartype=="homoscedastic") { + v<-vcov(mod.un) + } else if (vartype=="robust") { + v<-vcov(mod.un,type="HC1") # White with small sample correction + } else if (vartype=="cluster") { + v<-vcovCluster(mod.un,cluster = data.aug[,cl]) + } else if (vartype=="pcse") { + v<-pcse(mod.un, + groupN=data.aug[,cl], + groupT=data.aug[,time], + pairwise=pairwise)$vcov + } + if (wald == TRUE) { + requireNamespace("lmtest") + wald.out <- tryCatch( + p.wald <- round(waldtest(mod.re, mod.un, + test="Chisq", vcov=v)[[4]][2],4) + , error = function(e){return(NULL)} + ) + if (is.null(wald.out)==TRUE) { + p.wald <- NULL + warning("Var-cov matrix nearly sigular in the Wald test.") + } + } + + ################################## + ## storage + out<-list(est.binning = round(est.binning, 4), + binary.treatment=round(btreat,4), + bin.size = round(c(table(groupX)/length(groupX)),4), + treat.variation.byGroup=round(varD,4), + X.Lkurtosis = round(Lkurtosis,4)) + if (nbins==3) { + out<-c(out,list(correctOrder=correctOrder)) + } + if (nbins%in%c(2,3)) { + out<-c(out,list(p.twosided=p.twosided)) + } + if (wald == TRUE) { + out <- c(out, list(p.wald = p.wald)) + } + if (figure==TRUE) {out<-c(out,list(graph=p1))} + return(out) +} + +########################### + +inter.kernel <- function(Y, D, X, + Z = NULL, + weights = NULL, # weighting variable + FE = NULL, + data, + na.rm = FALSE, + CI = TRUE, + conf.level = 0.95, + cl = NULL, + neval = 50, + nboots = 200, + parallel = TRUE, + cores = 4, + seed = 02139, + bw = NULL, # bandwidth + grid = 20, # either a number of a sequence of numbers + metric = "MSPE", + Ylabel = NULL, + Dlabel = NULL, + Xlabel = NULL, + main = NULL, + xlim = NULL, + ylim = NULL, + Xdistr = "histogram", + file = NULL + ){ + + x <- NULL + y <- NULL + xmin <- NULL + xmax <- NULL + ymin <- NULL + ymax <- NULL + ME <- NULL + CI_lower <- NULL + CI_upper <- NULL + ## check input + if (is.character(Y) == FALSE) { + stop("Y is not a string.") + } else { + Y <- Y[1] + } + if (is.character(D) == FALSE) { + stop("D is not a string.") + } else { + D <- D[1] + } + if (is.character(X) == FALSE) { + stop("X is not a string.") + } else { + X <- X[1] + } + if (is.null(Z) == FALSE) { + for (i in 1:length(Z)) { + if (is.character(Z[i]) == FALSE) { + stop("Some element in Z is not a string.") + } + } + } + if (is.null(FE) == FALSE) { + for (i in 1:length(FE)) { + if (is.character(FE[i]) == FALSE) { + stop("Some element in FE is not a string.") + } + } + } + if (is.null(weights) == FALSE) { + if (is.character(weights) == FALSE) { + stop("weigths is not a string.") + } else { + weights <- weights[1] + } + } + if (is.data.frame(data) == FALSE) { + stop("Not a data frame.") + } + if (is.logical(na.rm) == FALSE & is.numeric(na.rm)==FALSE) { + stop("na.rm is not a logical flag.") + } + if (is.logical(CI) == FALSE & is.numeric(CI)==FALSE) { + stop("CI is not a logical flag.") + } + if (is.null(conf.level)==FALSE) { + if (is.numeric(conf.level)==FALSE) { + stop("conf.level should be a number between 0.5 and 1.") + } else { + if (conf.level<=0.5 | conf.level>1) { + stop("conf.level should be a number between 0.5 and 1.") + } + } + } + if (is.null(cl)==FALSE) { + if (is.character(cl) == FALSE) { + stop("cl is not a string.") + } else { + cl <- cl[1] + } + } + if (is.null(neval)==FALSE) { + if (is.numeric(neval)==FALSE) { + stop("neval is not a positive integer.") + } else { + neval <- neval[1] + if (neval%%1!=0 | neval<=0) { + stop("neval is not a positive integer.") + } + } + } + if (is.null(nboots) == FALSE) { + if (is.numeric(nboots)==FALSE) { + stop("nboots is not a positive integer.") + } else { + nboots <- nboots[1] + if (nboots%%1 != 0 | nboots < 1) { + stop("nboots is not a positive number.") + } + } + } + if (is.logical(parallel) == FALSE & is.numeric(parallel)==FALSE) { + stop("paralell is not a logical flag.") + } + if (is.numeric(cores)==FALSE) { + stop("cores is not a positive integer.") + } else { + cores <- cores[1] + if (cores%%1!= 0 | cores<=0) { + stop("cores is not a positive integer.") + } + } + if (is.null(bw)==FALSE) { + if (is.numeric(bw)==FALSE) { + stop("bw should be a positive number.") + } else { + bw <- bw[1] + } + if (bw<=0) { + stop("bw should be a positive number.") + } + } + if (is.numeric(seed)==FALSE) { + stop("seed should be a number.") + } + if (is.numeric(grid) == FALSE) { + stop("grid should be numeric.") + } else { + if (length(grid)==1) { + if (grid%%1 != 0 | grid<1) { + stop("grid is not a positive integer.") + } + } else { + grid <- grid[which(grid>0)] + } + } + if (!metric%in%c("MSPE","MAPE")) { + stop("metric should be either \"MSPE\" or \"MAPE\".") + } + if (is.null(Ylabel)==TRUE) { + Ylabel <- Y + } else { + if (is.character(Ylabel) == FALSE) { + stop("Ylabel is not a string.") + } else { + Ylabel <- Ylabel[1] + } + } + if (is.null(Dlabel)==TRUE) { + Dlabel <- D + } else { + if (is.character(Dlabel) == FALSE) { + stop("Dlabel is not a string.") + } else { + Dlabel <- Dlabel[1] + } + } + if (is.null(Xlabel)==TRUE) { + Xlabel <- X + } else { + if (is.character(Xlabel) == FALSE) { + stop("Xlabel is not a string.") + } else { + Xlabel <- Xlabel[1] + } + } + if (is.null(main)==FALSE) { + main <- main[1] + } + if (!Xdistr %in% c("hist","histogram","density")){ + stop("Xdistr must be either \"histogram\" or \"density\".") + } + if (is.null(xlim)==FALSE) { + if (is.numeric(xlim)==FALSE) { + stop("Some element in xlim is not numeric.") + } else { + if (length(xlim)!=2) { + stop("xlim must be of length 2.") + } + } + } + if (is.null(ylim)==FALSE) { + if (is.numeric(ylim)==FALSE) { + stop("Some element in ylim is not numeric.") + } else { + if (length(ylim)!=2) { + stop("ylim must be of length 2.") + } + } + } + + + ## set seed + if (is.null(seed)==FALSE) { + set.seed(seed) + } + + ## check variable name + M <- c(Y,D,X,Z,FE,cl,weights) + for (var in M){ + if((var %in% names(data))==FALSE ) + stop("Wrong variable name.") + } + + ## check missing values + if (na.rm == TRUE) { + data <- na.omit(data[,c(Y, D, X, Z, FE)]) + } else { + if (sum(is.na(data[,c(Y, D, X, Z, FE)]))>0) { + stop("Missing values. Try option na.rm = TRUE\n") + } + } + n<-dim(data)[1] + + ## check cluster + if (is.null(cl)==TRUE & is.null(FE)==FALSE) { + warnings("Fixed effects model assumed. Clustering standard errors highly recommended.") + } + + ## change fixed effect variable to factor + if (is.null(FE)==FALSE) { + if (length(FE) == 1) { + data[, FE] <- as.numeric(as.factor(data[, FE])) + } else { + data[, FE] <- sapply(data[,FE],function(vec){as.numeric(as.factor(vec))}) + } + } + + ## labels + if (is.null(Xlabel)==TRUE) {Xlabel = X} + if (is.null(Ylabel)==TRUE) {Ylabel = Y} + if (is.null(Dlabel)==TRUE) {Dlabel = D} + + ## X more than 5 values + if (length(unique(data[ ,X]))< 5) { + warning("Moderator has less than 5 values; consider a fully saturated model.") + } + + + ## preparation for parallel computing + if (parallel == TRUE & (CI == TRUE|is.null(bw))) { + requireNamespace("doParallel") + ## require(iterators) + maxcores <- detectCores() + cores <- min(maxcores, cores) + pcl<-makeCluster(cores) + doParallel::registerDoParallel(pcl) + cat("Parallel computing with", cores,"cores...\n") + } + + ## evaluation points + X.eval <- seq(min(data[,X]), max(data[,X]), length.out = neval) + + ## bandwidth selection + if(is.null(bw) == TRUE){ + CV <- 1 + if (length(grid) == 1) { + rangeX <- max(data[, X]) - min(data[, X]) + grid <- exp(seq(log(rangeX/50), log(rangeX), length.out = grid)) + } + cv.out <- crossvalidate(data = data, X.eval = X.eval, + Y = Y, D = D, X = X, Z = Z, + FE = FE, cl = cl, + weights = weights, + grid = grid, metric = metric, + parallel = parallel) + bw <- cv.out$opt.bw + } else { + CV <- 0 + } + + ## Estimates + if (CI == FALSE) { + est <- coefs(data = data, bw = bw, Y = Y, X = X, D = D, Z = Z, + FE = FE, X.eval = X.eval, weights = weights)[,c(1,3)] + } else { ## bootstrap + ## point estimate + coef <- coefs(data = data, bw = bw, Y = Y, X = X, D = D, Z = Z, + FE = FE, X.eval = X.eval, weights = weights)[,3] + ## boostrap SEs + if (is.null(cl)==FALSE) { ## find clusters + clusters<-unique(data[,cl]) + id.list<-split(1:n,data[,cl]) + } + oneboot <- function() { + if (is.null(cl)==TRUE) { + smp<-sample(1:n,n,replace=TRUE) + } else { ## block bootstrap + cluster.boot<-sample(clusters,length(clusters),replace=TRUE) + smp<-unlist(id.list[match(cluster.boot,clusters)]) + } + s<-data[smp,] + out <- coefs(data = s, bw = bw, Y = Y, X = X, D = D, Z = Z, + FE = FE, X.eval = X.eval, weights = weights)[,3] + return(out) + } + cat("Bootstrapping ...") + if (parallel==TRUE) { + suppressWarnings( + bootout <- foreach (i=1:nboots, .combine=cbind, + .export=c("oneboot","coefs"), + .inorder=FALSE) %dopar% {oneboot()} + ) + cat("\r") + } else { + bootout<-matrix(NA,length(X.eval),nboots) + for (i in 1:nboots) { + bootout[,i] <- oneboot() + if (i%%50==0) cat(i) else cat(".") + } + cat("\r") + } + ## summary + CI.lvl <- c((1-conf.level)/2, (1-(1-conf.level)/2)) + ci<-t(apply(bootout, 1, quantile, CI.lvl)) + est<-data.frame(cbind("X" = X.eval, "ME" = coef, + "SE"=apply(bootout,1,sd), + "CI_lower"=ci[,1], "CI_upper"=ci[,2])) + + } + + if (parallel == TRUE & (CI == TRUE|CV==1)) { + suppressWarnings(stopCluster(pcl)) + cat("\n") + } + + ########################################## + ## plotting + requireNamespace("ggplot2") + if(is.null(Xlabel)==FALSE){ + x.label<-c(paste("Moderator: ", Xlabel, sep="")) + y.label<-c(paste("Marginal Effect of ",Dlabel," on ",Ylabel,sep="")) + } else { + x.label<-c(paste("Moderator: ", X, sep="")) + y.label<-c(paste("Marginal Effect of ",D," on ",Y,sep="")) + } + + ## mark zero + p1 <- ggplot() + geom_hline(yintercept=0,colour="white",size=2) + + ## point estimates + p1 <- p1 + geom_line(data=est,aes(X,ME)) + + ## confidence intervals + if (CI == TRUE) { + p1 <- p1 + geom_ribbon(data=est, aes(x=X,ymin=CI_lower,ymax=CI_upper),alpha=0.2) + yrange<-na.omit(c(est$CI_lower, est$CI_upper)) + } else { + yrange<-na.omit(c(est$ME)) + } + + ## axis labels + p1 <- p1 + xlab(x.label) + ylab(y.label) + theme(axis.title = element_text(size=15)) + + ## ylim + if (is.null(ylim)==FALSE) { + yrange<-c(ylim[2],ylim[1]+(ylim[2]-ylim[1])*1/6) + } + maxdiff<-(max(yrange)-min(yrange)) + + ## plotting moderator distribution + if (is.null(Xdistr) == TRUE) { + Xdistr <- "density" + } else if (Xdistr != "density" & Xdistr != "histogram" & Xdistr != "hist") { + Xdistr <- "density" + } + if (Xdistr == "density") { # density plot + + if (length(unique(data[,D]))==2) { ## binary D + + ## get densities + de.co <- density(data[data[,D]==0,X]) + de.tr <- density(data[data[,D]==1,X]) + + ## put in data frames + deX.ymin <- min(yrange)-maxdiff/5 + deX.co <- data.frame( + x = de.co$x, + y = de.co$y/max(de.co$y) * maxdiff/5 + min(yrange) - maxdiff/5 + ) + deX.tr <- data.frame( + x = de.tr$x, + y = de.tr$y/max(de.tr$y) * maxdiff/5 + min(yrange) - maxdiff/5 + ) + + ## color + feed.col<-col2rgb("gray50") + col.co<-rgb(feed.col[1]/1000, feed.col[2]/1000,feed.col[3]/1000) + col.tr<-rgb(red=1, blue=0, green=0) + + ## plotting + p1 <- p1 + geom_ribbon(data = deX.co, + aes(x = x, ymax = y, ymin = deX.ymin), + fill = col.co, alpha = 0.2) + + geom_ribbon(data = deX.tr, + aes(x = x, ymax = y, ymin = deX.ymin), + fill = col.tr, alpha = 0.2) + + + } else { ## continuous D + + ## get densities + de <- density(data[,X]) + + ## put in data frames + deX.ymin <- min(yrange)-maxdiff/5 + deX <- data.frame( + x = de$x, + y = de$y/max(de$y) * maxdiff/5 + min(yrange) - maxdiff/5 + ) + + ## color + feed.col<-col2rgb("gray50") + col<-rgb(feed.col[1]/1000, feed.col[2]/1000,feed.col[3]/1000) + + ## plotting + p1 <- p1 + geom_ribbon(data = deX, + aes(x = x, ymax = y, ymin = deX.ymin), + fill = col, alpha = 0.2) + + } + + } else { # histogram plot + + if (length(unique(data[,D]))==2) { ## binary D + + hist.out<-hist(data[,X],breaks=80,plot=FALSE) + n.hist<-length(hist.out$mids) + dist<-hist.out$mids[2]-hist.out$mids[1] + hist.max<-max(hist.out$counts) + ## count the number of treated + count1<-rep(0,n.hist) + treat<-which(data[,D]==max(data[,D])) + for (i in 1:n.hist) { + count1[i]<-sum(data[treat,X]>=hist.out$breaks[i] & + data[treat,X]=hist.out$breaks[n.hist] & + data[treat,X]<=hist.out$breaks[n.hist+1]) + ## put in a data frame + histX<-data.frame(ymin=rep(min(yrange)-maxdiff/5,n.hist), + ymax=hist.out$counts/hist.max*maxdiff/5+min(yrange)-maxdiff/5, + xmin=hist.out$mids-dist/2, + xmax=hist.out$mids+dist/2, + count1=count1/hist.max*maxdiff/5+min(yrange)-maxdiff/5) + p1 <- p1 + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), + colour="gray50",alpha=0,size=0.5) + # control + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=count1), + fill="red",colour="grey50",alpha=0.3,size=0.5) # treated + + } else { ## continuous D + hist.out<-hist(data[,X],breaks=80,plot=FALSE) + n.hist<-length(hist.out$mids) + dist<-hist.out$mids[2]-hist.out$mids[1] + hist.max<-max(hist.out$counts) + + histX<-data.frame(ymin=rep(min(yrange)-maxdiff/5,n.hist), + ymax=hist.out$counts/hist.max*maxdiff/5+min(yrange)-maxdiff/5, + xmin=hist.out$mids-dist/2, + xmax=hist.out$mids+dist/2) + p1 <- p1 + geom_rect(data=histX,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), + colour="gray50",alpha=0,size=0.5) + } + } + + ## title + if (is.null(main)==FALSE) { + p1<-p1 + ggtitle(main) + + theme(plot.title = element_text(hjust = 0.5, size=35, + lineheight=.8, face="bold")) + } + + ## xlim and ylim + if (is.null(ylim)==FALSE) { + ylim2 = c(ylim[1]-(ylim[2]-ylim[1])*0.25/6, ylim[2]+(ylim[2]-ylim[1])*0.4/6) + } + if (is.null(xlim)==FALSE & is.null(ylim)==FALSE) { + p1<-p1+coord_cartesian(xlim = xlim, ylim = ylim2) + } + if (is.null(xlim)==TRUE & is.null(ylim)==FALSE) { + p1<-p1+coord_cartesian(ylim = ylim2) + } + if (is.null(xlim)==FALSE & is.null(ylim)==TRUE) { + p1<-p1+coord_cartesian(xlim = xlim) + } + + if (is.null(file)==FALSE) { + pdf(file) + plot(p1) + graphics.off() + } + ###################################################### + + output<-list(bw = bw, + est = est, + graph = p1) + + if (CV == 1) { + output <- c(output, list(CV.out = cv.out$CV.out)) + } + + return (output) +} + +###################################### +## Weighted Least-Squares +###################################### + +coefs <- function(data,bw,Y,X,D, + Z=NULL, + FE=NULL, + weights, + X.eval = NULL, + neval = 50){ + + + ## evaluation points + if (is.null(X.eval) == TRUE) { + X.eval <- seq(min(data$x), max(data$x), length.out = neval) + } + ## conditions + noZ <- is.null(Z) + noFE <- is.null(FE) + + ## survey weights + n <- dim(data)[1] + if (is.null(weights)==TRUE) { + weights <- rep(1, n) + } else { + weights <- data[,weights] + } + + ## storage + result <- matrix(NA, length(X.eval), (5 + length(Z))) + colnames(result)<-c("X","Intercept","ME","x","Dx",Z) + + ## main algorithm + if (noFE) { # no fixed effects, wls + ## formula + if (noZ) { ## no covariates + formula <- as.formula("y ~ d + xx + d_xx") + } else { ## with covariates + formula<- as.formula(paste("y ~ d + xx + d_xx + ",paste(Z,collapse="+"))) + } + ## weighted least squares without fixed effect + wls<-function(x, dat, weights){ + dat1<-dat + dat1$xx<-dat1$x-x + dat1$d_xx <- dat1$d * dat1$xx + dat1$w <- dnorm(dat1$xx/bw) * weights + reg <- lm(formula, data=dat1, weights = w) + result <- c(x, reg$coef) + result[which(is.na(result))] <- 0 + return(result) + } ## end of WLS function + + ## data to be used + dat<-data[, c(Y, D, X, Z)] + colnames(dat)[1:3] <- c("y","d","x") + ## estimation (a loop) + for (i in 1:length(X.eval)) { + result[i,] <- wls(X.eval[i], dat, weights = weights) + } + result <- data.frame(result) + } else{ # with fixed effects + ## data to be used + dat <- data[, c(Y, D, X, Z)] + dat.FE <- as.matrix(data[,FE],drop = FALSE) + for (i in 1:length(X.eval)) { + xx <- dat[,X]-X.eval[i] + dat1 <- as.matrix(cbind(dat[,Y],dat[,D], + xx, dat[,D] * xx, dat[,Z])) + w<-dnorm(xx/bw) * weights + estcoef <- fastplm(data = dat1, FE = dat.FE, + weight = w, FEcoefs = 0)$coefficients + estcoef[which(is.nan(estcoef))] <- 0 + result[i,] <- c(X.eval[i],0,estcoef) + } + result <- data.frame(result) + } + return(result) +} + + +###################################### +## Cross-validation +###################################### + +crossvalidate <- function(data, Y, D, X, Z = NULL, weights = NULL, + FE = NULL, cl = NULL, + X.eval, grid, nfold = 5, parallel = FALSE, bw, + metric = "MSPE"){ + + + ## %dopar% <- NULL + + + ## calculate error for testing set + getError <- function(train, test, bw, Y, X, D, Z, FE, weights, X.eval){ + + coef<-coefs(data=train,bw=bw,Y=Y,X=X,D=D,Z=Z,FE=FE, + weights = weights, X.eval= X.eval) + coef[is.na(coef)] <- 0 + n2<-length(Z) + esCoef<-function(x){ + Xnew<-abs(X.eval-x) + d1<-min(Xnew) ## distance between x[i] and the first nearest x in training set + label1<-which.min(Xnew) + Xnew[label1]<-Inf + d2<-min(Xnew) ## distance between x[i] and the second nearest x in training set + label2<-which.min(Xnew) + if(d1==0){ + func <- coef[label1,-c(1,4,5)] # X.eval (1), intercept (2), d (3), xx (4), d:xx (5), z + } else if(d2==0){ + func <- coef[label2,-c(1,4,5)] + } else{ ## weighted average + func <- (coef[label1,-c(1,4,5)] * d2 + coef[label2,-c(1,4,5)] * d1)/(d1 + d2) + } + return(func) + } + Knn<-t(sapply(test[,X],esCoef)) ## coefficients for test class==matrix + + ## predicting + test.Y <- test[,Y] + test.X <- as.matrix(cbind(rep(1,dim(test)[1]),test[,c(D,Z)])) + sumOfEst<-matrix(lapply(1:length(test.X), function(i){test.X[i]*Knn[[i]]}), + nrow=nrow(test.X), ncol=ncol(test.X)) + error<-test.Y - rowSums(matrix(unlist(sumOfEst),length(test.Y))) + return(c(mean(abs(error)),mean(error^2))) + } + ## grid search and 5 fold cross validation + cv<-function(bw){ + mse<-matrix(NA,nfold,2) + for(j in 1:nfold){ # 5-fold CV + testid <- which(fold==j) + mse[j,] <- getError(train= data[-testid,], test = data[testid, ], + bw = bw, Y=Y, X=X, D=D, Z=Z, FE=FE, weights = weights, X.eval= X.eval) + } + return(c(bw, apply(mse,2,mean))) + } + + + cat("Cross-validating bandwidth ... ") + ## generate 5 random folds + ## if is.null(cluster)==false then fold by cluster + n<-dim(data)[1] + fold<-rep(0,n) + + if(is.null(cl)==TRUE){ + fold <- c(0:(n-1))%%nfold + 1 + fold<-sample(fold, n, replace = FALSE) + } else{ + clusters<-unique(data[,cl]) + m <- length(clusters) + id.list<-split(1:n,data[,cl]) + cl.fold <- c(0:(m-1))%%nfold + 1 + cl.fold <- sample(cl.fold, m, replace = FALSE) + for (i in 1:m) { + id.list[[i]] <- rep(cl.fold[i],length(id.list[[i]])) + } + fold <- unlist(id.list) + } + + ## preprocessing if there's fixed effects + if (is.null(FE)==FALSE) { # with FE, first demean test data + for (var in c(Y,D,Z)) { + data[,var] <- fastplm(data = as.matrix(data[, var]), + FE = as.matrix(data[, FE],drop = FALSE), + weight = rep(1, n), + FEcoefs = 0)$residuals + } + } + ## calculation + if (parallel == TRUE) { + Error<-suppressWarnings(foreach(bw = grid, .combine = rbind, + .export = c("coefs","getError"), + .inorder = FALSE) %dopar% {cv(bw)}) + } else { + Error <- matrix(NA, length(grid), 3) + for (i in 1:length(grid)) { + Error[i, ] <- cv(grid[i]) + cat(".") + } + } + colnames(Error) <- c("bandwidth","MAPE","MSPE") + rownames(Error) <- NULL + + if (metric=="MAPE") { + opt.bw <- grid[which.min(Error[,2])] + } else { + opt.bw <- grid[which.min(Error[,3])] + } + output <- list(CV.out = round(Error,3), + opt.bw = opt.bw) + cat(paste("Bandwidth =", round(output$opt.bw,3),"\n")) + return(output) +} + + +############# + +## vcovCluster.r +## function to compute var-cov matrix using clustered robust standard errors +## inputs: +## model = model object from call to lm or glm +## cluster = vector with cluster ID indicators +## output: +## cluster robust var-cov matrix +## to call this for a model directly use: +## coeftest(model,vcov = vcovCluster(model, cluster)) +## formula is similar to Stata's , cluster command + +vcovCluster <- function( + model, + cluster +) +{ + requireNamespace("sandwich") + ## require(lmtest) + + if(nrow(model.matrix(model))!=length(cluster)){ + stop("check your data: cluster variable has different N than model") + } + M <- length(unique(cluster)) + N <- length(cluster) + K <- model$rank + ## if(M<50){ + ## warning("Fewer than 50 clusters, variances may be unreliable (could try block bootstrap instead).") + ## } + dfc <- (M/(M - 1)) * ((N - 1)/(N - K)) + uj <- apply(estfun(model), 2, function(x) tapply(x, cluster, sum)); + rcse.cov <- dfc * sandwich(model, meat. = crossprod(uj)/N) + ##colnames(rcse.cov)<-rownames(rcse.cov)<-names(model$coefficients) + return(rcse.cov) +} + diff --git a/README.md b/README.md old mode 100644 new mode 100755 index addae64..786c6bd --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # interflex -## Multiplicative Interaction Models Diagnostics and Visualization, Producing Flexible Marginal Effect Estimates +## Multiplicative Interaction Models Diagnostics and Visualization --- @@ -8,7 +8,7 @@ **Maintainer:** Yiqing Xu [] -**How to Uses:** [Examples](http://yiqingxu.org/software/interaction/RGuide.html) +**How to Uses:** [Examples](http://yiqingxu.org/software/interplot/RGuide.html) **Reference:** "How Much Should We Trust Estimates from Multiplicative Interaction Models? Simple Tools to Improve Empirical Practice." Available at SSRN: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2739221 diff --git a/man/inter.binning.Rd b/man/inter.binning.Rd old mode 100644 new mode 100755 index ff7932d..d3699a1 --- a/man/inter.binning.Rd +++ b/man/inter.binning.Rd @@ -3,11 +3,12 @@ \title{The Binning Estimator} \description{Implementing the binning estimator, a generalization of the multiplicative interaction model and conducting various diagnostic tests} -\usage{inter.binning(Y, D, X, Z = NULL, weights = NULL, FE = NULL, - data, nbins = 3, cutoffs = NULL, vartype = "homoscedastic", - cl = NULL, time = NULL, pairwise = TRUE, - Ylabel = NULL, Dlabel = NUL, Xlabel = NULL, - main = NULL, xlim = NULL, ylim = NULL, Xdistr = "histogram") +\usage{inter.binning(Y, D, X, Z = NULL, FE = NULL, weights = NULL, + data, na.rm = FALSE, nbins = 3, cutoffs = NULL, + vartype = "homoscedastic", + cl = NULL, time = NULL, pairwise = TRUE, figure = TRUE, main = NULL, + Ylabel = NULL, Dlabel = NULL, Xlabel = NULL, + xlim = NULL, ylim = NULL, interval = NULL, Xdistr = "histogram", wald = TRUE) } \arguments{ \item{Y}{a string, name of the outcome variable.} @@ -58,6 +59,10 @@ \item{Xdistr}{a string indicating the way the distribution of the moderator will be plotted, either "histogram" (or "hist") or "density". The default is "histogram".} + \item{figure}{a logical flag indicating whether to draw the graph. + The default is TRUE.} + \item{interval}{manually set the interval for the graph.} + \item{wald}{wald test.} } \details{ \bold{inter.binning} implements the binning estimator. There are @@ -154,7 +159,7 @@ inter.binning(Y = "Y", D = "D", X = "X", Z = "Z1", data = s2, cutoffs = c(1,2,4,5)) ## with fixed effects -s4$wgt <- 1 +s4[,"wgt"] <- 1 inter.binning(Y = "Y", D = "D", X = "X", Z = "Z1", weights = "wgt", FE = c("group","year"), data = s4, cl = "group", vartype = "cluster") diff --git a/man/inter.gam.Rd b/man/inter.gam.Rd old mode 100644 new mode 100755 index 59f76c9..77a0c28 --- a/man/inter.gam.Rd +++ b/man/inter.gam.Rd @@ -3,7 +3,8 @@ \title{Visualization of a Generalized Additive Model (GAM)} \description{Estimating and plotting a GAM before estimating a multiplicative interaction model} \usage{inter.gam(Y, D, X, Z = NULL, weights = NULL, FE = NULL, - data, SE = FALSE, k = 10, angle = c(30,100,-30,-120)) + data, SE = FALSE, k = 10, angle = c(30,100,-30,-120), + Ylabel = NULL, Dlabel = NULL, Xlabel = NULL) } \arguments{ \item{Y}{a string, name of the outcome variable.} diff --git a/man/inter.kernel.Rd b/man/inter.kernel.Rd old mode 100644 new mode 100755 index d08616f..9c02b15 --- a/man/inter.kernel.Rd +++ b/man/inter.kernel.Rd @@ -6,14 +6,14 @@ linear interaction effect (LIE) assumption of conventional multiplicative interaction models and safeguards against excessive extrapolation } -\usage{inter.binning(Y, D, X, Z = NULL, weights = NULL, FE = NULL, +\usage{inter.kernel(Y, D, X, Z = NULL, weights = NULL, FE = NULL, data, na.rm = FALSE, CI = TRUE, conf.level = 0.95, cl = NULL, neval = 50, nboots = 200, parallel = TRUE, cores = 4, seed = 02139, bw = NULL, grid = 20, metric = "MSPE", - Ylabel = NULL, Dlabel = NUL, Xlabel = NULL, + Ylabel = NULL, Dlabel = NULL, Xlabel = NULL, main = NULL, xlim = NULL, ylim = NULL, - Xdistr = "histogram") + Xdistr = "histogram", file = NULL) } \arguments{ \item{Y}{a string, name of the outcome variable.} @@ -68,7 +68,8 @@ improve aesthetics).} \item{Xdistr}{a string indicating the way the distribution of the moderator will be plotted, either "histogram" (or "hist") or - "density". The default is "histogram".} + "density". The default is "histogram".} + \item{file}{save the output graph to the file.} } \details{ \bold{inter.kernel} implements a kernel smoothing estimator of the @@ -127,13 +128,8 @@ Available at SSRN: \url{https://papers.ssrn.com/abstract_id=2739221}. \examples{ library(interflex) data(interflex) -inter.kernel(Y = "Y", D = "D", X = "X", Z = "Z1", - data = s3, nboots = 200, bw = 1.3) - -## with fixed effects -inter.kernel(Y = "Y", D = "D", X = "X", Z = "Z1", FE = c("group","year"), - data = s4, cl = "group") - +inter.kernel(Y = "Y", D = "D", X = "X", Z = "Z1", data = s3, + nboots = 200, bw = 1.3, parallel = FALSE) } \keyword{graphics} diff --git a/man/inter.raw.Rd b/man/inter.raw.Rd old mode 100644 new mode 100755 index 988aa8b..071293d --- a/man/inter.raw.Rd +++ b/man/inter.raw.Rd @@ -2,9 +2,9 @@ \alias{inter.raw} \title{Plotting Raw Data} \description{Plotting raw data before estimating a multiplicative interaction model} -\usage{inter.rawgplot(Y, D, X, weights = NULL, data, +\usage{inter.raw(Y, D, X, weights = NULL, data, nbins = 3, cutoffs = NULL, span = NULL, - Ylabel = NULL, Dlabel = NULL, Xlabel = NULL) + Ylabel = NULL, Dlabel = NULL, Xlabel = NULL, pos = NULL) } \arguments{ \item{Y}{a string, name of the outcome variable.} @@ -32,7 +32,8 @@ smoother lines (from \bold{ggplot2}).} \item{Ylabel}{a string that controls the label of Y in the plot.} \item{Dlabel}{a string that controls the label of D in the plot.} - \item{Xlabel}{a string that controls the label of X in the plot.} + \item{Xlabel}{a string that controls the label of X in the plot.} + \item{pos}{the interval of Y.} } \details{ \bold{inter.raw} provides a simple visual diagnostic, a scatterplot, diff --git a/man/interflex-package.Rd b/man/interflex-package.Rd old mode 100644 new mode 100755 index a4cea84..85d37c1 --- a/man/interflex-package.Rd +++ b/man/interflex-package.Rd @@ -1,8 +1,8 @@ \name{interflex-package} \alias{interflex} \docType{package} -\title{\packageTitle{interflex}} -\description{\packageDescription{interflex}} +\title{Multiplicative Interaction Models Diagnostics and Visualization} +\description{Producing Flexible Marginal Effect Estimates with Multiplicative Interaction Models} \details{This package performs diagnostics and visualizations of multiplicative interaction models. Besides conventional linear @@ -15,7 +15,7 @@ safegiard against excessive extrapolation.} \author{Jens Hainmueller; Jonathan Mummolo; Yiqing Xu (Maintainer).\cr - Please write to if you find any bugs.} + Please report bugs to .} \references{Jens Hainmueller; Jonathan Mummolo; Yiqing Xu. 2016. "How Much Should We Trust Estimates from Multiplicative Interaction Models? Simple Tools to Improve Empirical Practice." Available at SSRN: \url{https://papers.ssrn.com/abstract_id=2739221}.} diff --git a/man/s1.Rd b/man/s1.Rd new file mode 100755 index 0000000..41659fd --- /dev/null +++ b/man/s1.Rd @@ -0,0 +1,10 @@ +\name{s1} +\docType{data} +\alias{s1} +\title{s1} +\description{Simulated dataset 1: dichotomous treatment with linear + marginal effects.} +\usage{s1} +\format{dataframe} +\keyword{datasets} + diff --git a/man/s2.Rd b/man/s2.Rd new file mode 100755 index 0000000..695ef78 --- /dev/null +++ b/man/s2.Rd @@ -0,0 +1,10 @@ +\name{s2} +\docType{data} +\alias{s2} +\title{s2} +\description{Simulated dataset 2: continuous treatment with linear + marginal effects.} +\usage{s2} +\format{dataframe} +\keyword{datasets} + diff --git a/man/s3.Rd b/man/s3.Rd new file mode 100755 index 0000000..a3030bd --- /dev/null +++ b/man/s3.Rd @@ -0,0 +1,10 @@ +\name{s3} +\docType{data} +\alias{s3} +\title{s3} +\description{Simulated dataset 3: dichotomous treatment with nonlinear + marginal effects.} +\usage{s3} +\format{dataframe} +\keyword{datasets} + diff --git a/man/s4.Rd b/man/s4.Rd new file mode 100755 index 0000000..f9a15fb --- /dev/null +++ b/man/s4.Rd @@ -0,0 +1,9 @@ +\name{s4} +\docType{data} +\alias{s4} +\title{s4} +\description{Simulated dataset 4: dichotomous treatment with fixed effects..} +\usage{s4} +\format{dataframe} +\keyword{datasets} + diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp old mode 100644 new mode 100755 diff --git a/src/fastplm.cpp b/src/fastplm.cpp old mode 100644 new mode 100755 index fe7b19b..87ed08e --- a/src/fastplm.cpp +++ b/src/fastplm.cpp @@ -1,300 +1,300 @@ -#include -// [[Rcpp::depends(RcppArmadillo)]] - -using namespace Rcpp; - -// [[Rcpp::export()]] -List fastplm(arma::mat data, - arma::mat FE, - arma::colvec weight, - int FEcoefs = 0 - ){ - - // parse data - int n = data.n_rows; - int k = data.n_cols; - int m = FE.n_cols; - int p = k-1; // No. of covariates - arma::mat data_bak = data; - arma::mat data_wei = data; - arma::mat data_old = arma::zeros(n, k); - double diff = 100; - - /*if(weight.is_empty()){ - weight = arma::ones(n, 1); - }*/ - - // declare variables - arma::colvec y; // n*1 - arma::mat X; // n*p - arma::colvec resid; // n*1 - arma::colvec e; // n*1 (with fixed effects) - arma::colvec coeff; // coefficient (full) - //arma::colvec se; // SE (full) - arma::colvec coef; // coefficient - //arma::colvec stderror; // SE - int df; // degrees of freedom - double sig2; // sigma2 - double mu; // grand mean - arma::colvec LHS; // group means - arma::mat W; // big weighting matrix to calculate fixed effects - arma::colvec alphas; // fixed effect coefficients - - /* count total number of groups (loss of degrees of freedom) */ - arma::mat FEvalues; - for(int ii=0; ii 1e-5) & (niter<50)) { - - // demean Y and X - for(int ii=0; ii0) { - X = data.cols(1, p); // n*p - //store coefficents and check variation of X - coeff =arma::zeros(p,1); //store coefficients - //se =arma::zeros(p,1); - - // check X variation - int cc = p; - for(int i=0;i0) { - // stderror = arma::sqrt(sig2 * arma::diagvec(arma::inv(arma::trans(X)*X))); - // } - - // fill in non-NaN coefficients - int count=0; - //String label = xname(0); - for(int i=0; i 0) { - X = data.cols(1, p); // n*p - coef = coeff; - for (int i=0; i 0) { - e = e - X * coef; - } - - arma::colvec LHS(gtot, arma::fill::zeros); - arma::mat W(gtot, gtot, arma::fill::zeros); - arma::colvec FEval1 = arma::unique(FE.col(0)); - arma::colvec FEval2 = arma::unique(FE.col(1)); - int f1 = FEval1.n_rows; - int f2 = FEval2.n_rows; - - for (int i=0; i 0) { - output["coefficients"] = coeff; - //output["stderr"] = se; - } - output["residuals"] = resid; - output["niter"] = niter; - output["FEvalues"] = FEvalues; - if (FEcoefs == 1) { - output["mu"] = mu ; - output["ngroups"] = gtot; - } - return(output); - -} - - -// [[Rcpp::export()]] -arma::colvec fastplm_predict(double mu, - arma::mat FEvalues, // 3 columns - arma::mat FE, - arma::mat newx, - arma::mat beta) { - // parse data - int n = newx.n_rows; - int p = newx.n_cols; - int m = FE.n_cols; // number of fixed effects - int g = FEvalues.n_rows; - - // number of FE levels in each grouping - arma::ivec nlvls(m, arma::fill::zeros); - for (int i=0; i 0) { - pred_y = pred_y + arma::vectorise(newx * beta); - } - - //fixed effects - for (int i=0; i +// [[Rcpp::depends(RcppArmadillo)]] + +using namespace Rcpp; + +// [[Rcpp::export()]] +List fastplm(arma::mat data, + arma::mat FE, + arma::colvec weight, + int FEcoefs = 0 + ){ + + // parse data + int n = data.n_rows; + int k = data.n_cols; + int m = FE.n_cols; + int p = k-1; // No. of covariates + arma::mat data_bak = data; + arma::mat data_wei = data; + arma::mat data_old = arma::zeros(n, k); + double diff = 100; + + /*if(weight.is_empty()){ + weight = arma::ones(n, 1); + }*/ + + // declare variables + arma::colvec y; // n*1 + arma::mat X; // n*p + arma::colvec resid; // n*1 + arma::colvec e; // n*1 (with fixed effects) + arma::colvec coeff; // coefficient (full) + //arma::colvec se; // SE (full) + arma::colvec coef; // coefficient + //arma::colvec stderror; // SE + int df; // degrees of freedom + double sig2; // sigma2 + double mu; // grand mean + arma::colvec LHS; // group means + arma::mat W; // big weighting matrix to calculate fixed effects + arma::colvec alphas; // fixed effect coefficients + + /* count total number of groups (loss of degrees of freedom) */ + arma::mat FEvalues; + for(int ii=0; ii 1e-5) & (niter<50)) { + + // demean Y and X + for(int ii=0; ii0) { + X = data.cols(1, p); // n*p + //store coefficents and check variation of X + coeff =arma::zeros(p,1); //store coefficients + //se =arma::zeros(p,1); + + // check X variation + int cc = p; + for(int i=0;i0) { + // stderror = arma::sqrt(sig2 * arma::diagvec(arma::inv(arma::trans(X)*X))); + // } + + // fill in non-NaN coefficients + int count=0; + //String label = xname(0); + for(int i=0; i 0) { + X = data.cols(1, p); // n*p + coef = coeff; + for (int i=0; i 0) { + e = e - X * coef; + } + + arma::colvec LHS(gtot, arma::fill::zeros); + arma::mat W(gtot, gtot, arma::fill::zeros); + arma::colvec FEval1 = arma::unique(FE.col(0)); + arma::colvec FEval2 = arma::unique(FE.col(1)); + int f1 = FEval1.n_rows; + int f2 = FEval2.n_rows; + + for (int i=0; i 0) { + output["coefficients"] = coeff; + //output["stderr"] = se; + } + output["residuals"] = resid; + output["niter"] = niter; + output["FEvalues"] = FEvalues; + if (FEcoefs == 1) { + output["mu"] = mu ; + output["ngroups"] = gtot; + } + return(output); + +} + + +// [[Rcpp::export()]] +arma::colvec fastplm_predict(double mu, + arma::mat FEvalues, // 3 columns + arma::mat FE, + arma::mat newx, + arma::mat beta) { + // parse data + int n = newx.n_rows; + int p = newx.n_cols; + int m = FE.n_cols; // number of fixed effects + int g = FEvalues.n_rows; + + // number of FE levels in each grouping + arma::ivec nlvls(m, arma::fill::zeros); + for (int i=0; i 0) { + pred_y = pred_y + arma::vectorise(newx * beta); + } + + //fixed effects + for (int i=0; inE!mN5M=4igM~S?6?Zb{-rjFog zebn*1yS}1Tu1xS4+=0Yn31Ij3^evSy`5E~W`7!w?`IMvwkD0~-yNi90Jc?U&v>P-1 znx5{S8DOXN^xFpq2e%Gx-@bKl>ox|R){EzN$Uq-QZjd;6!?($Rd~U>452ks`{k-MW@M%0YESk69h4!N_xX8a` zkDd|7!)xCRy}b21(E8BzOKsnUwnyG1zB~$pIP@rLKBmdhk>PoU$(tF6V=pufPpHIX zrRtRUzROx&@7(Yv1@ZTw=`5l#ji&U8`HWf#8q|P#6vkC*XG;7rFvCe;V?NOA5YQ{M z&H*=!1Q&-`A&o3rYQM+Y>C`})ImB@T`_UxGo8Dth7c_a0B#~>1u{y*JE@|W@F{X8c zI05luj-V+%o$!`-Sc_)_7t(x#Jl~+qH>jAaM}v#x61Vuzk1mY}E9gHTg3iGO#tM>4 z%Z_kM7=Twrco5?#OoXLV1H^L%NE;_X88aoGl{5nbm%R47WT=nGa5U(j9uxiXh@-LQ z(Szd>+PQu-dNM$0?^Y@AlF^^e^e2xvA$Y|VOdvQ-Jn;*+_&yOp@k+0%OkyBU9V868ls&BHvKHG?Kop>qCiTe>u%l zM$^m0lOzPO*Mvq<7;UR!&i^amFH6iYbY{^p8$7XCa-i{!Bz(dosDe8TBEps)COzJn zll%fmI=P)H<^f5{QRuNJH^=`7@Q=^Gk|c;d>p9cPqGu^=(g6a(fCvyfW}>jHW~$nC&_f? zCWF)Cp%sp)UK-;k^%M$aP(2AVV1_TX2B0i8d~@RTADb;*SL5rz(u;`t`z4s|Sv;O3 zfezDCq{P2lg5C|MG;+M~iqG}WmS8_@aUGVTNgQqAprpowlGYoPwAP@cW`mMSgZeQ- zV{2QMFc6;w2!g{yfglgEWta5*vDatLsTJ$=5S-Dtpt^1h`@-k-!B8fHlTneuK195N zFd+u<4RG}t(q|!x(CAE{DC~b9KvJ+>2#Fp`59BH`#{E21H`0S04h>?+ zG(~p$iQua;(N^e>jl_jr4bpE3lF;xE97&CnaI7y0@K#-fw-zPP4rkoX@Fn%j)I9oZ z97YhY{c?`9b+*uPy*}dP-9eZ+O%?`6(rO%JgvB_k_(%<8NeyI`y<8SJJb$$^OIxsX z^dGyDQd)`?(wtJQr~Fk7r40KUAJ(!dB}UBbKO&s6F`3(xtmDRnwWOQYbRVP7llfIL zS)!@0!1mOJMFF-F#7)Dw6fy8O6)`hnD{bL%9meS*Vj@Gf!wr(YnNGgGu~6=!*nDUqz#WkGay0+ZmPX&xkmK@gq{#SUvjs5;3)V6O86kX7gMcEo zwp%J^(38rNqh4JaNqKoYsaXfgXdffAxpZ7c{7@hYrOGJjGJTTSyUmiXM|+c)5(l<= zi9TRiS$6HxVv%Ea-7qGQG-w-=El_fnY{w2WC`n#;0U0G7CzyI?%fSaV{E?*}XWg6c zEZg9rft#?Y_j8`V-xW{4@TM=e<(OJ@s1^SGa9e)Jf^Ec0$#@lOP%SfEfz6*@jI6sZ zwdSxlEL{3;*QIaAuxXF3nL)%t^_va3ATy}zvOw8jecEm0%AEEQqSd$?}A$s^k9b2*Z?p+?aWBWmC@Hf#bL3=&QfNMH-d*%V2=kW1makp75_ zT13}jzjAgmRLQW*{s!1pbZE{+|Emx!Lx6)k!Xz0Ma3e8&F`;~-T;|~ZMsQnF18nSVOmKtwh=|XX(f} zNBz0IZr=C3yia)be8>D-<#eKljohIgYu+7(E8JnY(jA5k++n!d9cobK-C?-CI}8`O zLn?E$TCExsq!v4Rjm}S{79rMbW|;0&k~$>&_&`|dh9iR$7TCY6V#@%>?jnMpW&}-U zyB=$Pw$E40_W8=$K5sDF=c~--&`5Kk#y2iJt zU)JF-`x0el1ne!M)J|XjXA{DLQeyfH*4!_g4K<>(dE#6eNRoFVsrc%D7bMvp-zfeC zJO)>2SIg2|aFM?7C456B-rYe`G)O)tl+3Su?-d9Mn}xjbukuEdUbq1;POM9jLeid8 z*d(B{F@=44@#~fQc`s#qVMO)AyxoatRf9s<#l(nUxi-V7P~!*X9_z)#jc9FOl>G`M z@B2>Pm+ryQIPEwjm#|OgKTBbHUCd@=Ortmi*K(6?LLN)j<#WPLjYwh9kt&e) z+(}2s(Nd3fJUS!!2WuXIm7e00Kc8|R?_2IN-|}c-v2Xox58Pm2Q~{QBRdF6}2P_e> z2VOC7Lmcw;`uxs;Ed!(pu$DfvGtj0ycXB$59ZDjbIDGCWmE3$d{Eaf=vXg2v!6vV;JGch^O2V zu;d81C18n)ad&Njq9GMDgTo3|({3YJMl6wP3gzr2L@INZw`HwrZCT>1ZwaN878Mwl z5U5C~-O+?1CK`~9U;_=PE$|gp@R7O-8jzrs93fY3brBBIK|=GPVI7cBTfF^-gv+al zNV$uApd-HmiK>m_(~<;Cy9h_bi7J90a0~>acAFxeZA6RkO=&}g4Hdyiu^wV>V4-Lr zhK%r$k|@!sJqry4wVTzfht24S6lQf$`a_yfk=j(JC16{^=@P+mw<1WEQJ#`f>_{kf zq~0=BO%c(MsZo0usizg9X62H`Y!*v&gYj62B zy3t#`d;6O1&^Es_`)(C$!M)i{uB@(mdy`!~I6luXwTDJB@u61<4F02qCf6(Xw-eu# z+ljvxXie@GvRf*17o1+!UBET?<__JAFPg?)dKZfQr2WF*e{|}80Me=a0%YaAE%Y0u);h|gpFhvm zqw9O=tvaDuj|OawA+FPZ8b^h?iW?Ina+6D+Ca%>VszledbRpEIy2@5ZoQS@OS1ErZ zaQF{!x)(H{5qLuCFtA3=t}|=v3an$5nEYZb7RIm=pmVo|H|56zbB>=~C+ME!S>lte_6}oiLGsrWm(XrDUvmGVQ}mkMlN(*D>MqP zYK7JhXsKK+m)k4LovbByi3=)u@9!%p-!@zn_1xw*zs??}d=EMvNs40c_d?X-aN*-6H>F=wfsv+#mlb8e#Ze+bT9BPOwBz@!p((u!b@ z(|`O-<~)I={A4`m#GP7A zFKwn-6k3;)h=(vuN1k39OQR}h{kt00*;%$~9%R7^pP$e3F@_m_C)bhbm*f5m$5mN? zRfAW`W}BFvk=<+9B*s4AQfiOX3h!c%v5>7qBKmWMw4G%< z&>9jYiKlkIPuejD0Q!PltqO{(uh6T z=9*9^z4R7QG_SwMOf}4O;q}leUj-<*m41gf%5c2JJlU-~KMb-LAng8NdV3_VrJ3L9 z`{tI0`6};LqDjDZHM>{f$F&mPxau_9bA(`YzovrY*5mH+H@NznP(x>H-O>%T(5l!qL^?{PYXf3w8)9Lu9wKg7rlI@^a$kFC zQSGG-DhJhkv)xy4+E;L*wbUDImeY`%Uzy>gjbB_?I}p~;19vg5#x8lf17X!4Nj%vB z*BZ~?3EQd@AMb!ITDEq=PAp`vu}~S_`gie2maU^pZL=$m(#|=Gyz;8bvpq#}I_%UH z(oLJVD^5%qY3Q~vKKJ2oc{7Adk2%1%*p1IeK2NkOhF3F>!|eBdvW=<&qX)5 z6_eRi&~#4F(n50Ru-D1NXieoj%jEDsE^uF*?i*H|&dzvlAI5t2EpPs0n|`xF%j+&> zRSp*~N0`(mHL0Z}Y?meeKE_yeZ)0^7DBD-*t1YUx{Kdpl(q{Ge)Dmg4Q({58RV}|q z3y9a-Boy5GC2rd|8A>&aN)5hW^p#(0=Dn}C5aGudYo*g}T}$v$hO*099PV%ycT<_L zpP}qdt=C$-@MDa%(nGecKlq5C3WK}xkn^qb5$>W>RaP${usX%MKG)N#r8!W-v;33PwtH^$@lgEH`W1m?h-q%EV-n_ zWE1I4sCHiWvnu`NjeSd(*!dc_md-aMJYOF0x@(QRB;ZgJwee6zvOAL6;zi)#q0Ju- zJCdkn*O3$ii;8m{RLHV}ibJ*?3}YM_wVMbwghrBrA7m3fbS3f7q8ffs@k19qScpcF zYO7&(TAWpFHO#idd5CJ5jI|i0hARAN2C}cz!w($=J!ps*n4Z%+T|8z~LC+CGKI}Q7 zOkuZcA*~RBi3l7OnMiI4Y6n$cbj%JCZDY1AJ}ZYERL9YvqGDgSCo1`MM2)~hL&L*H zbrH1=k{~%~LRI7`jwsc6Xe*-KwuMTTEW}YE?$E+>(ZY`3lEolQMBO!oUz%+p*=UIw zHY{YaHi{EGo)$&6Y}wWFw}(AZ;qZo}iMIzh?5SeTdu`z{y{-t2UJtih2o}E3Lj`7g z8cHgtYHv?uF@Qy+_fUb;j*0?Wv#JVrMAcK#*cBy~cV!s}K3PT&BJkls?cxW#j;JV` zsR~NEs>lTJpiwNRgXJu#e_E@hX5ojmirE5zjkU6I>cI;Os?f?F6pV&PZ<}~nrS%nC zOsNJu!|$RcSZbjM&A~kedT3!)GzIl2%Hr~<$o&=MhGM#4EkrJf>MOUMHiEaTV+z41 zBk9<4+Sv6JxiqdOoVCtAr|yA0um|?Q9@qnWU=O_MKw4b)TmN?;ite&vx_QyvTdaLo wMRrS7eD4$`eD9~+v&Q$>?^pEMIE<3`)8ZrgJ9(*9XaF|<4+c&e)Pa-$0E74zp#T5? literal 0 HcmV?d00001 diff --git a/stata/interflex.ado b/stata/interflex.ado index 8034458..d77be38 100644 --- a/stata/interflex.ado +++ b/stata/interflex.ado @@ -35,7 +35,8 @@ version 12 local y `Y' local x `X' local d `D' - local var `y' `d' `varlist' + local z `varlist' + local var `y' `d' `z' /* check input and set default */ /* type */ @@ -75,6 +76,7 @@ version 12 } if ("`vce'"=="cluser"& "`cluster'"=="") { disp as error "Clustering variable not specified" + exit } if ("`vce'"=="cluster" & "`cluster'"!="") { local vce = "cl " + "`cluster'" @@ -139,6 +141,12 @@ version 12 disp as err "Weights not supported with the kernel estimation" exit } + /*check X more than 5 values if type = kernel*/ + qui tab `x' + local xcount = r(r) + if("`type'"=="kernel"&`xcount'<5){ + disp as res "Moderator has less than 5 values; consider a fully saturated model." + } preserve @@ -154,7 +162,7 @@ version 12 if("`type'"!="kernel"){ // binning or linear tempvar dx gen `dx' = `x'*`d' - local var `y' `d' `x' `dx' `varlist' + local var `y' `d' `x' `dx' `z' if("`vce'"!="bootstrap"){ if("`fe'"==""){ qui: reg `var' `wgt', vce(`vce') @@ -208,14 +216,14 @@ version 12 mat colnames margeff = xlevel marg se CI_l CI_u N local binary = 0 } - scalar nc = colsof(margeff) - scalar quant = invnormal(0.975) + /*local nc = colsof(margeff)*/ + local quant = invnormal(0.975) /****** cross-validation *****/ if("`type'"=="kernel" & "`bw'"=="0") { local CV = 1 mat kernel = J(`grid',1,0) - local first = log((`xmax' - `xmin')/20) + local first = log((`xmax' - `xmin')/10) local last = log(`xmax' - `xmin') local step = (`last'-`first')/`grid' @@ -252,9 +260,8 @@ version 12 } /* number of covariates (Z) */ - local con: word count `varlist' - scalar con1 = `con'+1 - local con2 `=con1' + local con: word count `z' + local con1 = `con'+1 forvalues k = 1(1)`grid'{ forvalues i = 1(1)5{ @@ -267,45 +274,62 @@ version 12 qui: gen `wei' = sqrt(normalden(`xx'/el(gs,`k',1))) qui: gen `dx' = `xx'*`d' if ("`fe'"==""){ - qui reg `var' `xx' `dx' [weight= `wei'] if `fold' != `i' + cap qui reg `var' `xx' `dx' [weight= `wei'] if `fold' != `i' } else { - qui reghdfe `var' `xx' `dx' [weight= `wei'] if `fold' != `i', absorb(`fe') + cap qui reghdfe `var' `xx' `dx' [weight= `wei'] if `fold' != `i', absorb(`fe') } - mat coe = e(b) - forvalues kk = 1(1)`con2'{ - mat coef1[`j',`kk'] = el(coe,1,`kk') + if(_rc==0){ + mat coe = e(b) + forvalues kk = 1(1)`con1'{ + mat coef1[`j',`kk'] = el(coe,1,`kk') + } + if("`fe'"==""){ + mat coef1[`j',2+`con'] = el(coe,1,colsof(coe)) + } + else{ + mat coef1[`j',2+`con'] = 0 + } + cap drop `xx' `wei' `dx' + cap mat drop coe } - if("`fe'"==""){ - mat coef1[`j',2+`con'] = el(coe,1,colsof(coe)) + else { + mat kernel[`k',1] = . } - else{ - mat coef1[`j',2+`con'] = 0 - } - cap drop `xx' `wei' `dx' - cap mat drop coe } - mata:sumofse("`y' `x' `d' `varlist' `fold'","eval","coef1","fol") - mat kernel[`k',1] = el(kernel,`k',1) + r(SE) - cap mat drop coef1 + if(el(kernel,`k',1)!=.){ + mata:sumofse("`y' `x' `d' `z' `fold'","eval","coef1","fol") + mat kernel[`k',1] = el(kernel,`k',1) + r(SE) + cap mat drop coef1 + } + else { + mat kernel[`k',1] = . + } } } - local j = 1 - scalar coordinate = 1 - scalar min = kernel[1,1] + local coordinate = 1 + local min = kernel[1,1] while `j'<`grid'{ local ++j - if(el(kernel,`j',1)<`=min'){ - scalar min = kernel[`j',1] - scalar coordinate = `j' + if(el(kernel,`j',1)<`min'){ + local min = kernel[`j',1] + local coordinate = `j' } } - local bw = exp(`first'+`step'*(`=coordinate'-1)) + local bw = exp(`first'+`step'*(`coordinate'-1)) mat cv = gs, kernel mat colnames cv = bw MSPE + /* mpse contains missing value*/ + forvalues i =1(1)`grid'{ + if(el(kernel,`i',1)==.){ + display as txt "MPSE contains missing value" + continue, break + } + } + **mat list cv display as txt "The optimal bandwidth is" as res %9.4f `bw' cap mat drop kernel gs @@ -325,7 +349,7 @@ version 12 /*******calculation *********/ if("`type'"=="kernel"){ - + if ("`vce'"!="bootstrap") { forvalues v = 1(1)`neval'{ tempvar xx wei dx @@ -333,19 +357,26 @@ version 12 qui: gen `dx' = `xx'*`d' qui: gen `wei' = normalden(`xx'/`bw') if("`fe'"==""){ - qui: reg `var' `xx' `dx' [weight = `wei'], vce(`vce') + cap qui: reg `var' `xx' `dx' [weight = `wei'], vce(`vce') } else { - qui: reghdfe `var' `xx' `dx' [weight = `wei'], /// + cap qui: reghdfe `var' `xx' `dx' [weight = `wei'], /// absorb(`fe') vce(`vce') } mat margeff[`v',1] = el(eval,1,`v') - mat coe = e(b) - mat vcov = e(V) - mat margeff[`v',2] = el(coe,1,1) - mat margeff[`v',3] = sqrt(el(vcov,1,1)) - mat margeff[`v',4] = el(coe,1,1)-`=quant'*sqrt(el(vcov,1,1)) - mat margeff[`v',5] = el(coe,1,1)+`=quant'*sqrt(el(vcov,1,1)) + if(_rc==0){ + mat coe = e(b) + mat vcov = e(V) + mat margeff[`v',2] = el(coe,1,1) + mat margeff[`v',3] = sqrt(el(vcov,1,1)) + mat margeff[`v',4] = el(coe,1,1)-`quant'*sqrt(el(vcov,1,1)) + mat margeff[`v',5] = el(coe,1,1)+`quant'*sqrt(el(vcov,1,1)) + } + else { + forvalues i =2(1)5{ + mat margeff[`v',`i'] =. + } + } qui count if `x' >= margeff[`v', 1] & `x' < (margeff[`v', 1] + `step') mat margeff[`v', 6] = `r(N)' if(`binary'==1){ @@ -353,7 +384,7 @@ version 12 mat margeff[`v', 7] = `r(N)' } drop `xx' `wei' `dx' - mat drop coe vcov + cap mat drop coe vcov } } // end non-bootstrap @@ -371,13 +402,18 @@ version 12 qui: reg `var' `xx' `dx' [weight = `wei'] } else { - qui: reghdfe `var' `xx' `dx'[weight = `wei'], absorb(`fe') + cap qui: reghdfe `var' `xx' `dx'[weight = `wei'], absorb(`fe') } - cap drop `xx' `wei' `dx' + drop `xx' `wei' `dx' mat margeff[`v',1] = el(eval,1,`v') - mat coe = e(b) - mat margeff[`v',2] = el(coe,1,1) - mat drop coe + if("`fe'"==""|_rc==0){ + mat coe = e(b) + mat margeff[`v',2] = el(coe,1,1) + mat drop coe + } + else{ + mat margeff[`v',2] = . + } mat bootce = J(`reps',1,.) // preserve and restore @@ -386,19 +422,26 @@ version 12 qui: gen `xx' = `x'- el(eval,1,`v') qui: gen `dx' = `xx'*`d' qui: gen `wei' = normalden(`xx'/`bw') - qui myboot `var' `xx' `dx'[weight = `wei'], fe(`fe') cluster(`cluster') - mat coe = r(coe) - mat bootce[`j',1] = el(coe,1,1) + cap qui myboot `var' `xx' `dx'[weight = `wei'], fe(`fe') cluster(`cluster') + if(_rc==0){ + mat coe = r(coe) + mat bootce[`j',1] = el(coe,1,1) + } + else{ + dis as err "bootstrap results don't converge" + exit 430 + } cap drop `xx' `wei' `dx' cap mat drop coe + } mata:bce=st_matrix("bootce") mata:se=sqrt(variance(bce)) mata:st_numscalar("r(se)",se) local bse = r(se) mat margeff[`v',3] = `bse' - mat margeff[`v',4] = el(margeff,`v',2)-`=quant'*`bse' - mat margeff[`v',5] = el(margeff,`v',2)+`=quant'*`bse' + mat margeff[`v',4] = el(margeff,`v',2)-`quant'*`bse' + mat margeff[`v',5] = el(margeff,`v',2)+`quant'*`bse' mat drop bootce qui count if `x' >= margeff[`v', 1] & `x' < (margeff[`v', 1] + `step') @@ -424,8 +467,8 @@ version 12 mat margeff[`i', 1] = `xmin' + (`i'-1)* `step' mat margeff[`i', 2] = `coefD' + `coefDX' * margeff[`i',1] mat margeff[`i', 3] = sqrt(`varD' + margeff[`i',1]^2 * `varDX' + `covDX' * margeff[`i',1] * 2) - mat margeff[`i', 4] = margeff[`i', 2] - `=quant' * margeff[`i', 3] - mat margeff[`i', 5] = margeff[`i', 2] + `=quant' * margeff[`i', 3] + mat margeff[`i', 4] = margeff[`i', 2] - `quant' * margeff[`i', 3] + mat margeff[`i', 5] = margeff[`i', 2] + `quant' * margeff[`i', 3] /* est +/- 1.96 s.e. only work for relatively large samples */ count if `x' >= margeff[`i', 1] & `x' < (margeff[`i', 1] + `step') mat margeff[`i', 6] = `r(N)' @@ -453,12 +496,12 @@ version 12 } } else { - scalar binsize = wordcount("`cutoffs'") - local nbins = `=binsize'+1 + local binsize = wordcount("`cutoffs'") + local nbins = `binsize'+1 mat cuts = J(`nbins'+1, 1, .) mat cuts[1,1] = `xmin' local k = 1 - qui forvalues i=1/`=binsize' { + qui forvalues i=1/`binsize' { gettoken xx cutoffs: cutoffs if (`xx'>`xmin' & `xx'<`xmax') { mat cuts[(`k'+1),1] = `xx' @@ -493,22 +536,22 @@ version 12 if("`vce'"!="bootstrap"){ if("`fe'"==""){ - qui: reg `y' `binvar' `varlist' `wgt', /// + qui: reg `y' `binvar' `z' `wgt', /// noconstant vce(`vce') } else{ - qui: reghdfe `y' `binvar' `varlist' `wgt', /// + qui: reghdfe `y' `binvar' `z' `wgt', /// absorb(`fe') vce(`vce') } } else { // bootstrap if("`fe'"==""){ qui: bootstrap, reps(`reps') cl(`cluster'): reg `y' `binvar' /// - `varlist', noconstant + `z', noconstant } else { qui: bootstrap, reps(`reps') cl(`cluster'): reghdfe `y' `binvar' /// - `varlist', absorb(`fe') + `z', absorb(`fe') } } /* do not forget to put in the same set of control variables */ @@ -519,8 +562,8 @@ version 12 qui forvalues i = 1/`nbins' { mat MEbin[`i',2] = coefs[1,1+4*(`i'-1)] mat MEbin[`i',3] = sqrt(vcov[1+4*(`i'-1),1+4*(`i'-1)]) - mat MEbin[`i', 4] = MEbin[`i', 2] - `=quant' * MEbin[`i', 3] - mat MEbin[`i', 5] = MEbin[`i', 2] + `=quant' * MEbin[`i', 3] + mat MEbin[`i', 4] = MEbin[`i', 2] - `quant' * MEbin[`i', 3] + mat MEbin[`i', 5] = MEbin[`i', 2] + `quant' * MEbin[`i', 3] } /* wald test */ @@ -533,22 +576,22 @@ version 12 g `dx' = `d'*`x' if ("`vce'"!="bootstrap"){ if("`fe'"==""){ - qui: reg `y' `d' `x' `dx' `waldvar' `varlist' `wgt', /// + qui: reg `y' `d' `x' `dx' `waldvar' `z' `wgt', /// vce(`vce') } else{ - qui: reghdfe `y' `d' `x' `dx' `waldvar' `varlist' `wgt', /// + qui: reghdfe `y' `d' `x' `dx' `waldvar' `z' `wgt', /// absorb(`fe') vce(`vce') } } else { // bootstrap if("`fe'"==""){ qui: bootstrap, reps(`reps') cl(`cluster'): reg `y' `d' `x' `dx' `waldvar' /// - `varlist' + `z' } else { qui: bootstrap, reps(`reps') cl(`cluster'): reghdfe `y' `d' `x' `dx' `waldvar' /// - `varlist', absorb(`fe') + `z', absorb(`fe') } } local wcount = wordcount("`waldvar'") @@ -828,11 +871,12 @@ version 12 } } else{ - qui reghdfe `varlist' `wgt',absorb(`fe') + qui reghdfe `varlist' `wgt',absorb(`fe') } mat coe = e(b) restore return mat coe=coe + end *********************************************************mata version 12