Friday, October 2, 2009

some R code for conference #2

### Conférence de méthode #2

library("arm")

###########
### 3.1 ###
###########
getwd()
setwd("/home/argon/ScPo")

# first try... does not work: read.table() defaults to header=FALSE,
# so the header row "y x1 x2" is read as data
#Pyth <- read.table("exercise2.1.dat")
#str(Pyth)
#Pyth


Pyth <- read.table("exercise2.1.dat", header=T)
str(Pyth)
Pyth



# only the first 40 rows have observed y; x1 and x2 in rows 41-60
# are kept for prediction below
summary(Pyth[1:40,])

fit.Pyth <- lm(y ~ x1 + x2, data=Pyth[1:40,])
display(fit.Pyth)

## try to understand what kind of objects fit.Pyth and coef(fit.Pyth) are
attributes(fit.Pyth)
attributes(coef(fit.Pyth))
class(coef(fit.Pyth))
is.vector(coef(fit.Pyth))
# coef(fit.Pyth) is a named vector
coef(fit.Pyth)["x1"]
coef(fit.Pyth)[2]

## two plot panels side by side in one window instead of one plot
par(mfrow=c(1,2))


plot(Pyth$x1[1:40], Pyth$y[1:40], cex=.25)
# curve() takes an expression in 'x' as its first argument;
# here the expression is built as a matrix product: each row (1, x, mean(x2))
# is multiplied by the coefficient vector
# the choice of the intercepts is explained below
curve(cbind(1,x,mean(Pyth$x2[1:40])) %*% coef(fit.Pyth), add=T )

plot(Pyth$x2[1:40], Pyth$y[1:40], cex=.25)
curve(cbind(1,mean(Pyth$x1[1:40]),x) %*% coef(fit.Pyth), add=T )

# x1 against x2 (used below to argue that they are roughly independent)
plot(Pyth$x1[1:40], Pyth$x2[1:40], cex=.25)

# residual plots versus x1, x2, and predicted values
summary(residuals(fit.Pyth))
str(residuals(fit.Pyth))

plot(Pyth$x1[1:40], residuals(fit.Pyth), cex=.25)
abline(h=0)
abline(h=c(-1,1)*sd(residuals(fit.Pyth)), lty=2)  # +/- 1 residual sd
# errors are bigger at both ends of x1

plot(Pyth$x2[1:40], residuals(fit.Pyth), cex=.25)
abline(h=0)
abline(h=c(-1,1)*sd(residuals(fit.Pyth)), lty=2)  # +/- 1 residual sd
# the residuals are not well centered across the range of x2

plot(fitted.values(fit.Pyth)[1:40], residuals(fit.Pyth), cex=.25)
abline(h=0)
abline(h=c(-1,1)*sd(residuals(fit.Pyth)), lty=2)  # +/- 1 residual sd
# quite ok
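
# the three residual plots above repeat the same plot/abline calls;
# a small helper function (hypothetical, not part of the original code)
# would avoid the repetition:
resid.plot <- function(x, res) {
  plot(x, res, cex=.25)
  abline(h=0)
  abline(h=c(-1,1)*sd(res), lty=2)
}
# e.g. resid.plot(Pyth$x1[1:40], residuals(fit.Pyth))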

pred.Pyth <- predict(fit.Pyth, Pyth[41:60,], interval="prediction", level=.95)
pred.Pyth
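
# predict() on an lm fit with interval="prediction" returns a matrix
# with columns "fit", "lwr" and "upr", so the bounds can be used directly,
# e.g. the widths of the 95% prediction intervals:
pred.Pyth[, "upr"] - pred.Pyth[, "lwr"]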

### choice of the intercepts for the two separate graphs of y against x1 and x2
# the graph of x1 against x2, as well as a regression of x1 on x2, suggests that they are independent,
# so it makes sense to use the coefficients of the regression of y on x1 and x2
# to draw the lines on the two separate graphs of y against x1 and y against x2
# the slopes are given by the regression but the intercepts have to be recalculated:
# for each graph we want the line to be centered.
#
# the fitted model is:
# y = \beta_0 + \beta_1*x1 + \beta_2*x2 + error
#
# the estimated coefficients are \hat\beta_0, \hat\beta_1, \hat\beta_2
#
# graph of y against x1:
# the line is of the form:
# y = c_1 + \hat\beta_1*x1
#
# we define
# \bar{y}, \bar{x}_1, \bar{x}_2 as the sample means of y, x1, x2
#
# we want the line to be centered in the sense that
# it passes through the point (\bar{x}_1, \bar{y})
#
# since there is a constant term in the regression, we know that:
# \bar{y} = \hat\beta_0 + \hat\beta_1*\bar{x}_1 + \hat\beta_2*\bar{x}_2
#
# therefore:
# \bar{y} - \hat\beta_1*\bar{x}_1 = \hat\beta_0 + \hat\beta_2*\bar{x}_2
#
# and we want to find c_1 such that
# \bar{y} = c_1 + \hat\beta_1*\bar{x}_1
#
# therefore we take
# c_1 = \hat\beta_0 + \hat\beta_2*\bar{x}_2
#
# which is exactly the intercept used in the curve() call for x1 above
#
# for the graph of y against x2, it's symmetrical:
# c_2 = \hat\beta_0 + \hat\beta_1*\bar{x}_1
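
# quick numeric check of the derivation (a sanity check, not part of
# the original exercise): the line drawn for x1 should pass through
# (mean(x1), mean(y))
b <- coef(fit.Pyth)
c1 <- b[1] + b[3]*mean(Pyth$x2[1:40])
c1 + b[2]*mean(Pyth$x1[1:40])  # should equal...
mean(Pyth$y[1:40])             # ...the mean of y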



###########
### 3.2 ###
###########

#(a)
# the model is log(earnings) = \alpha + \beta*log(height) + error
# given: alpha_hat + beta_hat*log(66) = log(30000) and beta_hat = .8
log(30000)-.8*log(66)
# so alpha_hat = 6.96
# given: 2*sigma_hat = 1.1, so sigma_hat = .55
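
# quick check of the numbers above (a sanity check, not part of the exercise):
exp(6.96 + .8*log(66))  # predicted earnings at 66 inches, back near 30000
# with sigma_hat = .55, about 95% of earnings would fall within a factor
# of exp(2*.55) (roughly 3) of the predicted value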

###########
### 3.3 ###
###########

# one simulation: regress one randomly generated variable on another;
# the slope is typically not statistically significant
var1 <- rnorm(1000, 0, 1)
var2 <- rnorm(1000, 0, 1)
display(lm(var2 ~ var1))

z.scores <- rep(NA, 1000)
for (k in 1:1000){
  var1 <- rnorm(1000, 0, 1)
  var2 <- rnorm(1000, 0, 1)
  fit <- lm(var2 ~ var1)
  z.scores[k] <- coef(fit)[2]/se.coef(fit)[2]  # slope / its s.e. (se.coef is from arm)
}



length(which(abs(z.scores)>2))
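
# as a proportion: under the null we expect roughly 5% of the z-scores
# to exceed 2 in absolute value
mean(abs(z.scores) > 2)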
