# The role of assumptions in data analysis # Dr. Justin Esarey # Rice University # # January 13, 2013 ################################################## # example 1: the linear model is a good fit rm(list=ls()) set.seed(123912) education<-runif(200, min=0, max=7) education<-round(education) income<-8000+10000*education+rnorm(200, mean=0, sd=7000) plot(income~education, main="Fake Income and Education Data") dev.copy2eps(file="income_education.eps") # Loess prediction N-W plot(income~education, main="Nadaraya-Watson Smooth (Degree=0)") predict(loess(income~education, deg=0, span=0.25), 0:7, se=F)->np.income lines(0:7,np.income) dev.copy2eps(file="income_education_loess.eps") plot(income~education, main="Fake Income and Education Data") newdat<-as.data.frame(0:7) names(newdat)<-"education" predict(lm(income~education), newdata=newdat, se.fit=T)$fit->lm.fit lines(0:7, lm.fit,lwd=1, col="red") predict(lm(income~education), newdata=newdat, se.fit=T)$se.fit->lm.se lines(0:7,lm.fit+1.96*lm.se, col="red", lty=2) lines(0:7,lm.fit-1.96*lm.se, col="red", lty=2) lines(0:7,np.income) legend("bottomright", lty=c(1,1), col=c("black","red"), legend=c("Local Linear","OLS")) dev.copy2eps(file="linear_regression.eps") library(np) bw.calc<-npregbw(income~education,regtype="lc") np.2d.pred<-npreg(bw.calc)$merr plot(np.2d.pred~income, ylim=c(0,2000), ylab="SE of predicted income") predict(lm(income~education), se.fit=T)$se.fit->lm.se points(lm.se~income, col="red") # curse of dimensionality require(scatterplot3d) parent.income<-runif(200, min=5000, max=100000) income<-4000+10000*education+0.4*parent.income+rnorm(200, mean=0, sd=9000) plot<-scatterplot3d(cbind(education, parent.income,income), angle=45, scale.y=1) plot$plane3d(lm(income~education+parent.income)) library(np) bw.calc<-npregbw(income~parent.income+education,regtype="lc") np.2d.pred<-npreg(bw.calc)$merr plot(np.2d.pred~income, ylim=c(0,5000)) predict(lm(income~education+parent.income), se=T)->lm.2d.pred points(lm.2d.pred$se.fit~income, col="red") # example 2: the linear model is a poor fit education<-runif(200, min=0, max=7) education<-round(education) income<-40000+0.6*(sin((0.7)*education-2))*30000+rnorm(200, mean=0, sd=7000) plot(income~education, main="Fake Income and Education Data") dev.copy2eps(file="income_education_2.eps") plot(income~education, main="Nadaraya-Watson Smooth (Degree=0)") predict(loess(income~education, deg=0, span=0.25), 0:7, se=F)->np.income lines(0:7,np.income) dev.copy2eps(file="income_education_loess_2.eps") plot(income~education, main="Fake Income and Education Data") newdat<-as.data.frame(0:7) names(newdat)<-"education" predict(lm(income~education), newdata=newdat, se.fit=T)$fit->lm.fit lines(0:7, lm.fit,lwd=1, col="red") predict(lm(income~education), newdata=newdat, se.fit=T)$se.fit->lm.se lines(0:7,lm.fit+1.96*lm.se, col="red", lty=2) lines(0:7,lm.fit-1.96*lm.se, col="red", lty=2) lines(0:7,np.income) legend("bottomright", lty=c(1,1), col=c("black","red"), legend=c("Local Linear","OLS"))