Tuesday, September 22, 2009

R code for the height and earnings example

## Read & clean the data
# Data are at http://www.stat.columbia.edu/~gelman/arm/examples/earnings

library ("arm")
heights <- read.dta ("http://www.stat.columbia.edu/~gelman/arm/examples/earnings/heights.dta")
attach.all (heights)

# recode sex variable
male <- 2 - sex

# (for simplicity) remove cases with missing data
ok <- !is.na (earn+height+sex) & earn>0
heights.clean <- as.data.frame (cbind (earn, height, male)[ok,])
n <- nrow (heights.clean)
attach.all (heights.clean)
height.jitter.add <- runif (n, -.2, .2)

## Model fit
lm.earn <- lm (earn ~ height)
display (lm.earn)
sim.earn <- sim (lm.earn)
beta.hat <- coef(lm.earn)

## Figure 4.1 (left)
par (mar=c(6,6,4,2)+.1)
plot (height + height.jitter.add, earn, xlab="height", ylab="earnings", pch=20, mgp=c(4,2,0), yaxt="n", col="gray10",
main="Fitted linear model")
axis (2, c(0,100000,200000), c("0","100000","200000"), mgp=c(4,1.1,0))
for (i in 1:20){
curve (sim.earn$coef[i,1] + sim.earn$coef[i,2]*x, lwd=.5, col="gray", add=TRUE)}
curve (beta.hat[1] + beta.hat[2]*x, add=TRUE, col="red")

## Figure 4.1 (right)
par (mar=c(6,6,4,2)+.1)
plot (height + height.jitter.add, earn, xlab="height", ylab="earnings", pch=20, mgp=c(4,2,0), yaxt="n", col="gray10",
main="Fitted linear model",xlim=c(0,80),ylim=c(-200000,200000))
axis (2, c(-100000,0,100000), c("-100000","0","100000"), mgp=c(4,1.1,0))
for (i in 1:20){
curve (sim.earn$coef[i,1] + sim.earn$coef[i,2]*x, lwd=.5, col="gray", add=TRUE)}
curve (beta.hat[1] + beta.hat[2]*x, add=TRUE, col="red")


## Log transformation

log.earn <- log(earn)
earn.logmodel.1 <- lm(log.earn ~ height)
display(earn.logmodel.1)

# Figure 4.3

sim.logmodel.1 <- sim (earn.logmodel.1)
beta.hat <- coef (earn.logmodel.1)

par (mar=c(6,6,4,2)+.1)
plot (height + runif(n,-.2,.2), log.earn, xlab="height", ylab="log (earnings)", pch=20, yaxt="n", mgp=c(4,2,0), col="gray10",
main="Log regression, plotted on log scale")
axis (2, seq(6,12,2), mgp=c(4,1.1,0))
for (i in 1:20)
curve (sim.logmodel.1$coef[i,1] + sim.logmodel.1$coef[i,2]*x, lwd=.5, col="gray", add=TRUE)
curve (beta.hat[1] + beta.hat[2]*x, add=TRUE, col="red")

par (mar=c(6,6,4,2)+.1)
plot (height + runif(n,-.2,.2), earn, xlab="height", ylab="earnings", pch=20, yaxt="n", mgp=c(4,2,0), col="gray10",
main="Log regression, plotted on original scale")
axis (2, c(0,100000,200000), c("0","100000","200000"), mgp=c(4,1.1,0))
for (i in 1:20)
curve (exp(sim.logmodel.1$coef[i,1] + sim.logmodel.1$coef[i,2]*x), lwd=.5, col="gray", add=TRUE)
curve (exp(beta.hat[1] + beta.hat[2]*x), add=TRUE, col="red")

## Log-base-10 transformation

log10.earn <- log10(earn)
earn.log10model <- lm(log10.earn ~ height)
display(earn.log10model)

## Log scale regression model

earn.logmodel.2 <- lm(log.earn ~ height + male)
display(earn.logmodel.2)

## Including interactions

earn.logmodel.3 <- lm(log.earn ~ height + male + height:male)
display(earn.logmodel.3)

## Linear transformations

z.height <- (height - mean(height))/sd(height)
earn.logmodel.4 <- lm(log.earn ~ z.height + male + z.height:male)
display(earn.logmodel.4)

## Log-log model

log.height <- log(height)
earn.logmodel.5 <- lm(log.earn ~ log.height + male)
display(earn.logmodel.5)

No comments:

Post a Comment