########################
# Lecture 6
# Complex Hypothesis Testing and Interaction
# POLS 509: The Linear Model
# Dr. Justin Esarey
########################

rm(list=ls())
set.seed(20985)

# Power Analysis
########################

# distribution of beta-hat under the null hypothesis (beta = 0)
null<-seq(from=-3, to=3, by=0.01)
pdf.of.null<-dnorm(null, mean=0, sd=1)
plot(pdf.of.null~null, type="l", main="Distribution of beta hat under null",
     xlab="beta hat", ylab="f(b.hat)")
abline(v=1.5, lty=2)
abline(v=0, lty=3)
shade.x<-c(1.51, seq(from=1.51, to=3, by=0.01), 3)
shade.y<-c(0, dnorm(seq(from=1.51, to=3, by=0.01), mean=0, sd=1)-0.00075, 0)
polygon(shade.x, shade.y, border=NA, col="gray")
legend("topleft", lty=c(3,2), legend=c("null beta", "est. beta-hat"))
dev.copy2eps(file="null.eps")

# distribution of beta-hat under the alternative (beta = 1.5)
null<-seq(from=-3, to=3, by=0.01)
pdf.of.null<-dnorm(null, mean=1.5, sd=1)
plot(pdf.of.null~null, type="l", main="Distribution of beta hat under alternative",
     xlab="beta hat", ylab="f(b.hat)")
shade.x<-c(-3, seq(from=-3, to=1.65, by=0.01), 1.65)
shade.y<-c(0, dnorm(seq(from=-3, to=1.65, by=0.01), mean=1.5, sd=1)-0.00075, 0)
polygon(shade.x, shade.y, border=NA, col="gray")
abline(v=1.65, lty=2)
abline(v=1.5, lty=3)
legend("topleft", lty=c(3,2), legend=c("est. beta-hat", "critical b(t) for significance"))
dev.copy2eps(file="alternative.eps")

# power/size tradeoff at the 5% critical value (1.96)
null<-seq(from=-3, to=3, by=0.01)
pdf.of.null<-dnorm(null, mean=0, sd=1)
pdf.of.alt<-dnorm(null, mean=1.5, sd=1)
plot(1, type="n", axes=T, main="Power/Size Tradeoff", xlim=c(-3,3), ylim=c(0, 0.4),
     xlab="beta hat", ylab="f(b.hat)")
shade.x<-c(1.96, seq(from=1.96, to=3, by=0.01), 3)
shade.y<-c(0, dnorm(seq(from=1.96, to=3, by=0.01), mean=0, sd=1), 0)
polygon(shade.x, shade.y, border=NA, col="coral")
shade.x<-c(-3, seq(from=-3, to=1.96, by=0.01), 1.96)
shade.y<-c(0, dnorm(seq(from=-3, to=1.96, by=0.01), mean=1.5, sd=1), 0)
polygon(shade.x, shade.y, border=NA, col="cadetblue")
lines(pdf.of.null~null, col="red")
lines(pdf.of.alt~null, col="blue")
legend("topleft", col=c("red", "blue", "black"), lty=c(1,1,2),
       legend=c("f(b.hat under null)", "f(b.hat|b.hat=1.5)", "critical b(t)=1.96"))
abline(v=1.96, lty=2)
dev.copy2eps(file="power-size.eps")

# power/size tradeoff at the 10% critical value (1.645)
null<-seq(from=-3, to=3, by=0.01)
pdf.of.null<-dnorm(null, mean=0, sd=1)
pdf.of.alt<-dnorm(null, mean=1.5, sd=1)
plot(1, type="n", axes=T, main="Power/Size Tradeoff", xlim=c(-3,3), ylim=c(0, 0.4),
     xlab="beta hat", ylab="f(b.hat)")
shade.x<-c(1.645, seq(from=1.645, to=3, by=0.01), 3)
shade.y<-c(0, dnorm(seq(from=1.645, to=3, by=0.01), mean=0, sd=1), 0)
polygon(shade.x, shade.y, border=NA, col="coral")
shade.x<-c(-3, seq(from=-3, to=1.645, by=0.01), 1.645)
shade.y<-c(0, dnorm(seq(from=-3, to=1.645, by=0.01), mean=1.5, sd=1), 0)
polygon(shade.x, shade.y, border=NA, col="cadetblue")
lines(pdf.of.null~null, col="red")
lines(pdf.of.alt~null, col="blue")
legend("topleft", col=c("red", "blue", "black"), lty=c(1,1,2),
       legend=c("f(b.hat under null)", "f(b.hat|b.hat=1.5)", "critical b(t)=1.645"))
abline(v=1.645, lty=2)
dev.copy2eps(file="power-size2.eps")

# power/size tradeoff at a loose critical value (1)
null<-seq(from=-3, to=3, by=0.01)
pdf.of.null<-dnorm(null, mean=0, sd=1)
pdf.of.alt<-dnorm(null, mean=1.5, sd=1)
plot(1, type="n", axes=T, main="Power/Size Tradeoff", xlim=c(-3,3), ylim=c(0, 0.4),
     xlab="beta hat", ylab="f(b.hat)")
shade.x<-c(1, seq(from=1, to=3, by=0.01), 3)
shade.y<-c(0, dnorm(seq(from=1, to=3, by=0.01), mean=0, sd=1), 0)
polygon(shade.x, shade.y, border=NA, col="coral")
shade.x<-c(-3, seq(from=-3, to=1, by=0.01), 1)
shade.y<-c(0, dnorm(seq(from=-3, to=1, by=0.01), mean=1.5, sd=1), 0)
polygon(shade.x, shade.y, border=NA, col="cadetblue")
lines(pdf.of.null~null, col="red")
lines(pdf.of.alt~null, col="blue")
legend("topleft", col=c("red", "blue", "black"), lty=c(1,1,2),
       legend=c("f(b.hat under null)", "f(b.hat|b.hat=1.5)", "critical b(t)=1"))
abline(v=1, lty=2)
dev.copy2eps(file="power-size3.eps")
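# A numerical companion to the three tradeoff plots above (my addition, not
# part of the original lecture code): with a true effect beta and standard
# error 1, the power of a two-tailed test at critical value crit is
# P(b.hat > crit) + P(b.hat < -crit) under the alternative.
power.fn<-function(beta, crit, se=1){
  (1-pnorm(crit, mean=beta, sd=se))+pnorm(-crit, mean=beta, sd=se)
}
power.fn(1.5, 1.96)   # power at the 5% critical value
power.fn(1.5, 1.645)  # power at the 10% critical value
power.fn(1.5, 1)      # loosening the critical value buys power...
power.fn(0, 1)        # ...at the price of a much larger size (false-positive rate)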
# example power analysis
n<-50
x<-runif(n, min=0, max=10)
z<-runif(n, min=0, max=10)
y<-2*x+1.5*z+1+rnorm(n, mean=0, sd=1)
model<-lm(y~x+z)
summary(model)
summary(model)$coefficients
# false-negative probability for the intercept (row 1; column 3 is the
# t-statistic): the chance a replication's t-statistic falls below the 1.96
# critical value even though the true coefficient is nonzero
pt(1.96-summary(model)$coefficients[1,3], df=n-3)
# 34.6% probability of false negatives

# Asymptotic Testing
########################

set.seed(23123)

# start with a tiny sample, bimodal error
p.store.x<-c()
p.store.z<-c()
n<-10
sims<-5000
pb<-txtProgressBar(min=0, max=sims, initial=0, style=3)
for(i in 1:sims){
  setTxtProgressBar(pb, value=i)
  x<-runif(n, min=0, max=10)
  z<-runif(n, min=0, max=10)
  u<-rnorm(n, mean=1.5, sd=0.5)*sign(runif(n, min=-0.5, max=0.5))  # bimodal error, mean zero
  y<-2*x+1+u
  model<-lm(y~x+z)
  p.store.x[i]<-summary(model)$coefficients[2,4]
  p.store.z[i]<-summary(model)$coefficients[3,4]
}
close(pb)
sum(p.store.x<0.05)/sims
sum(p.store.z<0.05)/sims

# tiny sample, uniform error
p.store.x<-c()
p.store.z<-c()
n<-10
sims<-5000
pb<-txtProgressBar(min=0, max=sims, initial=0, style=3)
for(i in 1:sims){
  setTxtProgressBar(pb, value=i)
  x<-runif(n, min=0, max=10)
  z<-runif(n, min=0, max=10)
  u<-runif(n, min=-2, max=2)
  y<-2*x+1+u
  model<-lm(y~x+z)
  p.store.x[i]<-summary(model)$coefficients[2,4]
  p.store.z[i]<-summary(model)$coefficients[3,4]
}
close(pb)
sum(p.store.x<0.05)/sims
sum(p.store.z<0.05)/sims

# tiny sample, F-distributed error
p.store.x<-c()
p.store.z<-c()
n<-10
sims<-5000
pb<-txtProgressBar(min=0, max=sims, initial=0, style=3)
for(i in 1:sims){
  setTxtProgressBar(pb, value=i)
  x<-runif(n, min=0, max=10)
  z<-runif(n, min=0, max=10)
  u<-rf(n, df1=1, df2=100)-(100/98)  # F-distributed error, mean zero
  y<-2*x+1+u
  model<-lm(y~x+z)
  p.store.x[i]<-summary(model)$coefficients[2,4]
  p.store.z[i]<-summary(model)$coefficients[3,4]
}
close(pb)
sum(p.store.x<0.05)/sims
sum(p.store.z<0.05)/sims

# now a slightly larger sample
p.store.x<-c()
p.store.z<-c()
sims<-5000
n<-100
pb<-txtProgressBar(min=0, max=sims, initial=0, style=3)
for(i in 1:sims){
  setTxtProgressBar(pb, value=i)
  x<-runif(n, min=0, max=10)
  z<-runif(n, min=0, max=10)
  u<-rf(n, df1=1, df2=100)-(100/98)  # F-distributed error, mean zero
  y<-2*x+1+u
  model<-lm(y~x+z)
  p.store.x[i]<-summary(model)$coefficients[2,4]
  p.store.z[i]<-summary(model)$coefficients[3,4]
}
close(pb)
sum(p.store.x<0.05)/sims
sum(p.store.z<0.05)/sims

# try an "infinite" sample
p.store.x<-c()
p.store.z<-c()
sims<-5000
n<-1000
pb<-txtProgressBar(min=0, max=sims, initial=0, style=3)
for(i in 1:sims){
  setTxtProgressBar(pb, value=i)
  x<-runif(n, min=0, max=10)
  z<-runif(n, min=0, max=10)
  u<-rf(n, df1=1, df2=100)-(100/98)  # F-distributed error, mean zero
  y<-2*x+1+u
  model<-lm(y~x+z)
  p.store.x[i]<-summary(model)$coefficients[2,4]
  p.store.z[i]<-summary(model)$coefficients[3,4]
}
close(pb)
sum(p.store.x<0.05)/sims
sum(p.store.z<0.05)/sims

# tests of complex hypotheses
###############################

rm(list=ls())
set.seed(4589)
n<-100
x<-c(rep(0, 33), rep(1, 33), rep(0, 34))
z<-c(rep(0, 33), rep(0, 33), rep(1, 34))
y<-2*x+1.5*z+1+rnorm(n, mean=0, sd=1)
data<-as.data.frame(cbind(y, x, z))
model<-lm(y~x+z)

# Approach 1: t-test
###############

# get the VCV of the model
vcv.model<-vcov(model)
# test B1-B2=0 using a t-test
t.stat<-(model$coefficients[2]-model$coefficients[3])/
  sqrt(vcv.model[2,2]+vcv.model[3,3]-2*vcv.model[2,3])
p.val<-1-pt(t.stat, df=97)  # df = n - (number of coefficients) = 100 - 3
cat("t-statistic: ", t.stat, "\n")
cat("two-tailed p-value: ", 2*p.val, "\n")
# print the standard error of B1-B2
sqrt(vcv.model[2,2]+vcv.model[3,3]-2*vcv.model[2,3])
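# Cross-check on the hand-computed t-test (my addition, not part of the
# original lecture code; assumes the 'car' package is installed): the same
# linear restriction B1 - B2 = 0 can be tested with car::linearHypothesis(),
# whose F-statistic should equal the square of the t-statistic above.
if(requireNamespace("car", quietly=TRUE)){
  print(car::linearHypothesis(model, "x - z = 0"))
}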
# Approach 2: F-test
###############

# model 1: unrestricted
model<-lm(y~x+z)
# model 2: restricted, imposing B1=B2; x and z are mutually exclusive dummies,
# so x|z (coerced to 0/1) equals x+z and serves as the common-coefficient regressor
X<-x|z
model2<-lm(y~X)
anova(model, model2)

# Approach 3: simulation out of the asymptotic distribution
###############

require(mvtnorm)
draws<-rmvnorm(1000, mean=model$coefficients, sigma=vcv.model)
# show what this looks like
plot(draws[,2]~draws[,3])
difs<-draws[,2]-draws[,3]
plot(density(difs), xlab="beta_x - beta_z", main="Density of b_x - b_z")
quantile(difs, probs=c(0.025, 0.975))

# Approach 4: nonparametric bootstrap
###############

set.seed(21399)
beta.boot<-matrix(data=NA, nrow=1000, ncol=3)
v<-1:length(y)
for(i in 1:1000){
  # resample observations with replacement and re-estimate the model
  j<-sample(v, size=length(v), replace=T)
  data.samp<-data[j,]
  beta.boot[i,]<-lm(data.samp[,1]~as.matrix(data.samp[,2:3]))$coefficients
}
apply(beta.boot, MARGIN=2, FUN=mean)  # bootstrap means of the coefficients
difs<-beta.boot[,2]-beta.boot[,3]
quantile(difs, probs=c(0.025, 0.975))

#############################
# Interaction Model Analysis
#############################

rm(list=ls())
set.seed(2347823)

# make some data
X<-runif(50, min=-10, max=10)
Z<-runif(50, min=-10, max=10)
XZ<-X*Z
const<-rep(1, 50)
b<-c(0.123, 2.542, -2.843, -0.485)
y<-cbind(const, X, Z, XZ)%*%b+rnorm(50, mean=0, sd=15)

# save this data
require(foreign)
data<-as.data.frame(cbind(y, X, Z))
names(data)<-c("y", "X", "Z")
write.dta(data, file="lecture6interaction.dta")

# analyze data from scratch
#############################

# clear memory
rm(list=ls())
# read in data
data<-read.dta(file="lecture6interaction.dta")
attach(data)

# run a linear regression
model<-lm(y~X*Z)
summary(model)
# the coefficient on X*Z is sufficient to test whether X changes the relationship
# between y and Z (and vice versa), but how do we test for the statistical
# significance of the relationship between y and X (or y and Z)?

# recover the coefficients from the model
coef<-coefficients(model)
# recover the vcv matrix
vcv<-vcov(model)

# create a two-panel plot
par(mfrow=c(1,2))

# plot derivative values
Z.fits<-seq(from=-10, to=10, by=0.01)
# the marginal effect of X on y as a function of Z
dYdX<-coef[2]+Z.fits*coef[4]
plot(dYdX~Z.fits, type="l", ylim=c(-6, 12), xlab="Z", ylab="dY/dX",
     main="Analytic Calculation")

# what is the standard error of this quantity?
# exact calculation: var(b2 + Z*b4) = var(b2) + Z^2*var(b4) + 2*Z*cov(b2,b4)
dYdX.se<-sqrt(vcv[2,2]+Z.fits^2*vcv[4,4]+2*Z.fits*vcv[2,4])
# 95% CI
lines(dYdX+1.96*dYdX.se~Z.fits, lty=2)
lines(dYdX-1.96*dYdX.se~Z.fits, lty=2)
# show the zero line
abline(h=0, lty=3)

# draw out of the asymptotic distribution of betas, fixed sigma (hat)
plot(dYdX~Z.fits, type="l", ylim=c(-6, 12), xlab="Z", ylab="dY/dX", main="Simulation")
require(mvtnorm)
beta.draws<-rmvnorm(1000, mean=coef, sigma=vcv)  # mean and vcv come out of the regression results
# marginal effect for each draw: rows index Z values, columns index draws;
# each column must pair a draw of b2 with its own draw of b4
dYdX.draws<-outer(Z.fits, beta.draws[,4])+
  matrix(beta.draws[,2], nrow=length(Z.fits), ncol=nrow(beta.draws), byrow=TRUE)
dYdX.lo<-apply(dYdX.draws, MARGIN=1, FUN=quantile, probs=0.025)
dYdX.hi<-apply(dYdX.draws, MARGIN=1, FUN=quantile, probs=0.975)
# plot the 95% CI of dYdX
lines(dYdX.lo~Z.fits, lty=2)
lines(dYdX.hi~Z.fits, lty=2)
# show the zero line
abline(h=0, lty=3)
# some variation - could smooth this out...
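# Quick numeric check (my addition, not part of the original lecture code):
# at a few values of Z, the simulated 95% CI should track the analytic one.
# Z.fits runs from -10 to 10 by 0.01, so indices 1, 1001, 2001 are Z = -10, 0, 10.
check.idx<-c(1, 1001, 2001)
cbind(Z=Z.fits[check.idx],
      analytic.lo=(dYdX-1.96*dYdX.se)[check.idx], sim.lo=dYdX.lo[check.idx],
      analytic.hi=(dYdX+1.96*dYdX.se)[check.idx], sim.hi=dYdX.hi[check.idx])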
# redo all the plots
plot(dYdX~Z.fits, type="l", ylim=c(-6, 12), xlab="Z", ylab="dY/dX",
     main="Analytic Calculation")
lines(dYdX+1.96*dYdX.se~Z.fits, lty=2)
lines(dYdX-1.96*dYdX.se~Z.fits, lty=2)
abline(h=0, lty=3)
plot(dYdX~Z.fits, type="l", ylim=c(-6, 12), xlab="Z", ylab="dY/dX", main="Simulation")
# now use loess smoothing on the simulated CI bounds
dYdX.lo.loess<-loess(dYdX.lo~Z.fits)
dYdX.lo.smooth<-predict(dYdX.lo.loess, newdata=Z.fits)
dYdX.hi.loess<-loess(dYdX.hi~Z.fits)
dYdX.hi.smooth<-predict(dYdX.hi.loess, newdata=Z.fits)
lines(dYdX.lo.smooth~Z.fits, lty=2)
lines(dYdX.hi.smooth~Z.fits, lty=2)
# add a legend
legend("topright", legend=c("dY/dX", "95% CI"), lty=c(1,2))
# show the zero line
abline(h=0, lty=3)

# squared terms
############################

rm(list=ls())
set.seed(3249023)
n<-100
x<-runif(n, min=-10, max=10)
xsq<-x^2
y<-1+x+0.15*xsq+rnorm(n, mean=0, sd=1)
plot(y~x, xlim=c(-10,10))
# overlay the true conditional mean function
xx<-seq(from=-10, to=10, by=0.1)
xxsq<-xx^2
yy<-1+xx+0.15*xxsq
lines(yy~xx)
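# A natural next step (my addition, not part of the original lecture code):
# fit the quadratic by OLS and overlay the fitted curve on the scatterplot.
# Because x^2 is just another regressor, the marginal effect dY/dx = b1 + 2*b2*x
# can be analyzed with the same delta-method/simulation tools used above.
sq.model<-lm(y~x+I(x^2))
summary(sq.model)
lines(predict(sq.model, newdata=data.frame(x=xx))~xx, lty=2)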