Andrew Gelman's Sciences Po Course: some R code for Conference #1

##### install new packages in particular "arm"
chooseCRANmirror()
install.packages()

##### test
library("arm")
x <- c(1,2,3)
y <- c(1,2,5)
fit <- lm (y ~ x)
display (fit)

########################
##### exercise 2.4 #####
########################

mean.men <- 69.1
std.men <- 2.9

mean.women <- 63.7
std.women <- 2.7

# check syntax of function rnorm
?rnorm

# try the process once:
# take a sample of 100 men
# take a sample of 100 women
# take the means and the differences

sample.men <- rnorm(100, mean=mean.men, sd=std.men)
sample.women <- rnorm(100, mean=mean.women, sd=std.women)

mean(sample.men)
mean(sample.women)

## repeat the process 1000 times
# initialise an empty vector with 1000 missing values
x.y <- rep(NA,1000)
# check the length and the first 5 observations
length(x.y)
x.y[1:5]

# for loop to repeat the previous process
# each result is stored in one of the 1000 "spots" of the vector
for (i in 1:1000) {
x.y[i] <- mean(rnorm(100, mean=mean.men, sd=std.men))-mean(rnorm(100, mean=mean.women, sd=std.women))
}

# check the result
summary(x.y)
hist(x.y)
mean(x.y)
sd(x.y)

# theoretical values:
mean.men - mean.women
(sqrt(std.men^2 + std.women^2))/10

########################
##### exercise 3.4 #####
########################

# this library allows to read dataset in other formats (Stata, ...)
library("foreign")

# get the data on Andrew Gelman's web page
# http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/
# store it somewhere
# place of the working directory
getwd()
# change your working directory if needed or include the full path to the dataset that you just saved
# (if you use the full path, use / instead of \ )
# setwd("C:/blabla/blablabla/")

iq.data <- read.dta("child.iq.dta")
# or:
# iq.data <- read.dta("C:/blabla/blablabla/child.iq.dta")

# check what's inside this dataset
str(iq.data)
summary(iq.data)

# fit the model
iq.fit <- lm(ppvt ~ momage, data=iq.data)
display(iq.fit)

# plot some stuff
plot(iq.data$momage, iq.data$ppvt, xlab="mother's age", ylab="kid's score", cex=.25)
abline(iq.fit)

# another fit with a quadratic function of momage
iq.fit2 <- lm(ppvt ~ momage + I(momage^2), data=iq.data)
display(iq.fit2)
# the corresponding plot
curve(cbind(1,x,x^2) %*% coef(iq.fit2), add=TRUE, col="red")

# check for heteroskedasticity
plot(iq.data$momage, resid(iq.fit))

# including mother's education
# several ways...

iq.data$momeduc <- as.factor(iq.data$educ_cat)
str(iq.data)
summary(iq.data)
iq.fit.m1 <- lm(ppvt ~ momage + momeduc, data=iq.data)
display(iq.fit.m1)

iq.fit.m2 <- lm(ppvt ~ momage + educ_cat, data=iq.data)
display(iq.fit.m2)

iq.fit.m3 <- lm(ppvt ~ momage + momeduc + momage:momeduc, data=iq.data)
display(iq.fit.m3)

iq.fit.m4 <- lm(ppvt ~ momeduc + momage:momeduc -1, data=iq.data)
display(iq.fit.m4)

colors <- ifelse(iq.data$momeduc==1, "black",
ifelse(iq.data$momeduc==2, "blue",
ifelse(iq.data$momeduc==3, "red", "green")
)
)
plot(iq.data$momage, iq.data$ppvt, xlab="mother's age", ylab="kid's score", cex=1, col=colors)

# unsolved problem with the colors, if anyone has any idea... (did not have time to debug this...)
par(mfrow=c(2,2))
for (i in 1:4){
plot(iq.data$momage[iq.data$momeduc==i], iq.data$ppvt[iq.data$momeduc==i], xlab="mother's age", ylab="kid's score", cex=1)
}

################################
##### end of the exercises #####
################################

### beauty and teaching evaluation
# create a graph with R which is consistent with the table

b <- 0.13
a <- 4.01
par(mfrow=c(1,1))
plot(c(-10,10), c(-10, 10), type="n")
abline(a,b)

curve(cbind(1,x) %*% c(a,b))

beauty.n <- 463
beauty.resid.sd <- 0.55
beauty.R2 <- .04

beauty.resid <- rnorm(beauty.n, mean=0, sd=beauty.resid.sd)
hist(beauty.resid)
sd(beauty.resid)

beauty.sd <- sd(beauty.resid)*sqrt(beauty.R2/(1-beauty.R2))/b
beauty.sd

sample.beauty <- rnorm(beauty.n, mean=0, sd=beauty.sd)
sample.eval <- a + b*sample.beauty + beauty.resid

plot(sample.beauty, sample.eval, cex=.25)
fit.beauty <- lm(sample.eval ~ sample.beauty)
display(fit.beauty)
abline(fit.beauty)

# you can add dashed lined for the standard deviation...

### Earnings and height
# create a graph with R which is consistent with the table

earnings.n <- 10000
sample.height.in <- rnorm(earnings.n, mean=69.1, sd=2.9)
sample.height.cm <- sample.height.in*2.54
hist(sample.height)
earnings.resid.eur.sd <- 13000
earnings.resid.eur <- rnorm(earnings.n, mean=0, sd=earnings.resid.eur.sd)
sd(earnings.resid.eur)
sample.earnings.eur <- 340*sample.height.cm -41000 + earnings.resid.eur
sample.earnings.usd <- sample.earnings.eur*1.5

plot(sample.height.cm, sample.earnings.eur, cex=.25)
fit.earnings.eur <- lm(sample.earnings.eur ~ sample.height.cm)
display(fit.earnings.eur)
abline(fit.earnings.eur)

plot(sample.height.in, sample.earnings.usd, cex=.25)
fit.earnings.usd <- lm(sample.earnings.usd ~ sample.height.in)
display(fit.earnings.usd)
abline(fit.earnings.eur)
41000*1.5
340*2.54*1.5
13000*1.5

Andrew Gelman's Sciences Po Course

Tuesday, September 22, 2009

some R code for Conference #1

No comments:

Post a Comment

Followers

Blog Archive

Contributors