Friday, December 18, 2009

Pieces of R code for conference #10

######################
### conference #10 ###
######################

library("arm")

############
### 10.2 ###
############

## 10.2 (b): probe the regression-discontinuity design. `new` indicates the
## new surgical procedure; the forcing variable is age, with the cutoff at 80.
Bypass <- read.table("bypass.data.txt", header = TRUE)
str(Bypass)

### b
## Probability of receiving the new procedure as a function of being under 80.
logit1 <- glm(new ~ I(age < 80),
              family = binomial(link = "logit"), data = Bypass)
display(logit1)

## Add (normally unobserved) pre-operative severity as a predictor.
logit2 <- glm(new ~ I(age < 80) + severity,
              family = binomial(link = "logit"), data = Bypass)
display(logit2, digits = 3)

## Same model restricted to a window around the cutoff (ages 70-90).
logit3 <- glm(new ~ I(age < 80) + severity,
              family = binomial(link = "logit"), data = Bypass,
              subset = (age >= 70 & age <= 90))
display(logit3, digits = 3)

## Narrower window around the cutoff (ages 75-85).
logit4 <- glm(new ~ I(age < 80) + severity,
              family = binomial(link = "logit"), data = Bypass,
              subset = (age >= 75 & age <= 85))
display(logit4, digits = 3)

## Fitted probability of treatment just below (75-79) and just above (80-85)
## the cutoff, each evaluated at the mean severity within that age group.
p.below <- invlogit(coef(logit4)[1] + coef(logit4)[2] +
  coef(logit4)[3] * mean(Bypass$severity[Bypass$age >= 75 & Bypass$age < 80]))
p.above <- invlogit(coef(logit4)[1] +
  coef(logit4)[3] * mean(Bypass$severity[Bypass$age >= 80 & Bypass$age <= 85]))
p.below
p.above

## Binomial variances p * (1 - p) at the same two points.
## BUG FIX: the original below-80 line was garbled
## (`...(1-invlogit(coef(logit4)[1]>=75 & Bypass$age<80])))`, a syntax error);
## reconstructed to mirror the above-80 computation.
p.below * (1 - p.below)
p.above * (1 - p.above)

## Exploratory models: length of hospital stay as a function of severity
## and age (linear and quadratic in age).
fit.sev <- lm(stay ~ severity, data = Bypass)
display(fit.sev)

fit.sev.age <- lm(stay ~ severity + age, data = Bypass)
display(fit.sev.age)

fit.sev.age2 <- lm(stay ~ severity + age + I(age^2), data = Bypass)
display(fit.sev.age2)

## Stay vs. severity: old procedure plotted as circles (pch = 1 default),
## new procedure overlaid as crosses (pch = 3).
plot(stay ~ severity, data = Bypass, subset = new == 0, ylim = c(15, 45))
points(stay ~ severity, pch = 3, data = Bypass, subset = new == 1)

## Stay vs. age, same plotting convention.
plot(stay ~ age, data = Bypass, subset = new == 0,
     ylim = c(15, 45), xlim = c(50, 90))
points(stay ~ age, pch = 3, data = Bypass, subset = new == 1)

## Severity vs. age: here the new procedure is circles, the old is crosses.
plot(severity ~ age, data = Bypass, subset = (new == 1),
     xlim = range(Bypass$age), ylim = range(Bypass$severity))
points(severity ~ age, data = Bypass, subset = (new == 0), pch = 3)


### BONUS c ###

## Regression-discontinuity treatment-effect estimates for the bypass data.

# without severity: center age at the cutoff (80)
Bypass$age80 <- Bypass$age - 80

# Treatment effect allowing separate age slopes on either side of the cutoff.
TE1 <- lm(stay ~ new + age80 + age80:new, data = Bypass)
display(TE1)


# using the age threshold as a local IV
library("sem")

RDD.tsls <- tsls(stay ~ new + age80, ~ I(age < 80) + age80:I(age < 80) + age80, data = Bypass)
summary(RDD.tsls)

RDD.tsls2 <- tsls(stay ~ new + age80 + age80:new, ~ I(age < 80) + age80:I(age < 80) + age80, data = Bypass)
summary(RDD.tsls2)

RDD.tsls3 <- tsls(stay ~ new + age80, ~ I(age < 80) + age80, data = Bypass)
# BUG FIX: was summary(RDD.tsls3a) — an undefined object; the fit above is
# named RDD.tsls3.
summary(RDD.tsls3)


# example of a corresponding Wald estimate:
# intent-to-treat effect divided by the jump in treatment probability.
RDD.itt <- lm(stay ~ I(age < 80) + age80 + age80:I(age < 80), data = Bypass)
display(RDD.itt)

RDD.comply <- lm(new ~ I(age < 80) + age80 + age80:I(age < 80), data = Bypass)
display(RDD.comply)

RDD.Wald <- coef(RDD.itt)[2] / coef(RDD.comply)[2]
RDD.Wald


# using severity as a control instead (possible because severity happens to
# be observed in this illustrative data set)
TE <- lm(stay ~ new + severity + age, data = Bypass)
display(TE)


############
### 10.3 ###
############

### a

# Simulate an encouragement design: a milk voucher (instrument) encourages
# milk drinking (treatment), which affects height (outcome).
n <- 1000   # total sample size
ni <- 500   # number assigned a voucher

# Potential outcomes: h0 = height without milk, h1 = height with milk
# (mean treatment effect of +2).
h0 <- rnorm(n, mean=69.1, sd=2.9)
h1 <- h0 + rnorm(n, mean=2, sd=.5)

# Potential treatment: milk0 = milk drinking without a voucher,
# milk1 = with one. Anyone who drinks milk without a voucher also drinks
# with one, so there are no defiers (monotonicity holds by construction).
milk0 <- rbinom(n, size=1, prob=.3)
milk1 <- ifelse(milk0==1, 1, rbinom(n, size=1, prob=.5))

# Classify units into compliance types.
never <- milk0==0 & milk1==0
always <- milk0==1 & milk1==1
complier <- milk0==0 & milk1==1

sum(never)
sum(always)
sum(complier)

# Assign vouchers: the first ni units get one.
voucher <- rep(c(1,0), c(ni, n-ni))

# Observed treatment and outcome follow the assigned instrument.
milk <- ifelse(voucher, milk1, milk0)
h <- ifelse(milk, h1, h0)


# Intent-to-treat effect: regression of the outcome on the instrument.
ITT <- lm(h ~ voucher)
display(ITT)

# First stage: effect of the instrument on treatment (share of compliers).
comply <- lm(milk ~ voucher)
display(comply)

# Wald estimate of the LATE: ITT effect divided by the compliance rate.
LATE.Wald <- coef(ITT)[2]/coef(comply)[2]
LATE.Wald

library("sem")

# Two-stage least squares yields the same point estimate as the Wald ratio.
LATE.tsls <- tsls(h ~ milk, ~ voucher )
summary(LATE.tsls)


### b
# Exclusion-restriction violation with a STRONG instrument: non-compliers
# are given a treatment effect half the compliers' (+1 vs +2), and 800 of
# the 1000 units are compliers.
n.b <- 1000

# Fixed compliance types: 100 never-takers, 100 always-takers, 800 compliers.
never.b <- rep(c(1, 0, 0), c(100, 100, 800))
always.b <- rep(c(0, 1, 0), c(100, 100, 800))
complier.b <- rep(c(0, 0, 1), c(100, 100, 800))

h0.b <- rnorm(n.b, mean=69.1, sd=2.9)
# Compliers gain 2 on average; never/always-takers gain 1.
h1.b <- ifelse(complier.b, h0.b + rnorm(n.b, mean=2, sd=.5), h0.b + rnorm(n.b, mean=1, sd=.5))

# Randomized binary instrument.
voucher.b <- rbinom(n.b, 1, .5)

# Observed treatment by compliance type; observed outcome.
milk.b <- ifelse(never.b, 0, ifelse(always.b, 1, ifelse(voucher.b, 1, 0)))
h.b <- ifelse(milk.b, h1.b, h0.b)


# Intent-to-treat effect.
ITT.b <- lm(h.b ~ voucher.b)
display(ITT.b)

# First stage / compliance rate.
comply.b <- lm(milk.b ~ voucher.b)
display(comply.b)

# Wald (LATE) estimate; compare with the compliers' true effect of 2.
LATE.Wald.b <- coef(ITT.b)[2]/coef(comply.b)[2]
LATE.Wald.b

# Two-stage least squares version of the same estimate.
LATE.tsls.b <- tsls(h.b ~ milk.b, ~ voucher.b )
summary(LATE.tsls.b)




### c
# Same exclusion-restriction violation as (b), but now the instrument is
# WEAK: only 200 of the 1000 units are compliers (400 never-takers,
# 400 always-takers).
n.c <- 1000

never.c <- rep(c(1, 0, 0), c(400, 400, 200))
always.c <- rep(c(0, 1, 0), c(400, 400, 200))
complier.c <- rep(c(0, 0, 1), c(400, 400, 200))

h0.c <- rnorm(n.c, mean=69.1, sd=2.9)
# Compliers gain 2 on average; never/always-takers gain 1.
h1.c <- ifelse(complier.c, h0.c + rnorm(n.c, mean=2, sd=.5), h0.c + rnorm(n.c, mean=1, sd=.5))

# Randomized binary instrument.
voucher.c <- rbinom(n.c, 1, .5)

# Observed treatment by compliance type; observed outcome.
milk.c <- ifelse(never.c, 0, ifelse(always.c, 1, ifelse(voucher.c, 1, 0)))
h.c <- ifelse(milk.c, h1.c, h0.c)


# Intent-to-treat effect.
ITT.c <- lm(h.c ~ voucher.c)
display(ITT.c)

# First stage / compliance rate.
comply.c <- lm(milk.c ~ voucher.c)
display(comply.c)

# Wald (LATE) estimate; compare with the compliers' true effect of 2.
LATE.Wald.c <- coef(ITT.c)[2]/coef(comply.c)[2]
LATE.Wald.c

# Two-stage least squares version of the same estimate.
LATE.tsls.c <- tsls(h.c ~ milk.c, ~ voucher.c )
summary(LATE.tsls.c)

Thursday, December 10, 2009

First 5 problems of practice final exam for Sciences Po course, for mardi, 15 dec, 2009

You may use the textbook and one feuille of notes (both sides). No other references. You can use a pocket calculator but no computer.

Also please note that all these examples are fake; they are not based on real data.

Below are the first five problems; I'll send more soon.

Problem 1.

1000 people are surveyed and asked how many political activities they have done in the previous year. 300 people say “none,” 300 say “one,” 200 say “two,” 100 say “three,” and 100 say “4 or more.”

Give an estimate and standard error of the average number of political activities done by people in the population. (Hint: to compute the standard error of the mean, you need to estimate the standard deviation of the responses. If you draw a histogram of the responses, that should help.)

Problem 2.

A survey is done asking people if they support government-supplied child care. 60% of the men in the survey answer yes (with a standard error of 5%), and 70% of the women answer yes (with a standard error of 5%).

Give an estimate and standard error for the difference in support between men and women.

Assuming the data were collected using a simple random sample, how many people responded to the survey?

How large would the sample size have to be for the difference in support between men and women to have a standard error of 1%?

Problem 3.

A survey is done asking people if they support affirmative action for ethnic minorities. 30% of whites say yes (with a standard error of 3%), 80% of blacks say yes (with a standard error of 5%), and 70% of others say yes (with a standard error of 10%).

The population of interest is 70% white, 20% black, and 10% other. Assuming the survey is a simple random sample, give an estimate and standard error of the percentage of people in the population who support affirmative action for ethnic minorities.

Problem 4.

1100 people are surveyed: 500 say they voted for John McCain and 600 say they voted for Barack Obama. Of the McCain voters, 40% support the health-care bill that is being debated in Congress. Of the Obama voters, 70% support the bill. (People who did not vote in the past election are excluded from this study.)

Suppose you take the data (y=1 for supporting the bill and 0 for opposing it) and run a linear regression, predicting y from a variable called “obama,” which is 1 for people who say they voted for Obama and 0 for people who say they voted for McCain.

The regression output looks like this:

lm(formula = y ~ obama)
coef.est coef.se
(Intercept) 0.__ 0.__
obama 0.__ 0.__
---
n = ____, k = _
residual sd = 0.__, R-Squared = 0.__

Fill in the blanks.

Problem 5.

Students in a laboratory experiment are given the choice of which of two video games to play, a storybook adventure or a shoot-em-up game. A logistic regression is fit predicting the students’ choice (y=0 if they choose the adventure or 1 if they choose the shooting game), given their age in years. (The students’ ages in the experiment range from 18 to 25 years.)

The logistic regression output looks like this:

glm(formula = y ~ age, family = binomial(link = "logit"))
coef.est coef.se
(Intercept) 5.64 1.61
age -0.30 0.08
---
n = 230, k = 2
residual deviance = 263.5, null deviance = 281.0 (difference = 17.4)

From this model, what is the probability of choosing the shooting game if you are 20 years old?

Graph the fitted regression line (with x ranging from 18 to 25), and put in a bunch of dots representing values of x and y that are consistent with the fitted line.

Wednesday, December 9, 2009

Research article to be discussed mardi 15 dec

Regression models for decision making: a cost-benefit analysis of incentives in telephone surveys. Journal of Business and Economic Statistics (2003). (Andrew Gelman, Matt Stevens, and Valerie Chan)

Saturday, December 5, 2009

some R code for conference #10

######################
### conference #10 ###
######################

library("arm")

###########
### 9.8 ###
###########


## 9.8: simulate treated vs. control data and plot (a) no treatment effect,
## (b) constant treatment effect, (c) effect increasing with x.
n.C <- 100   # number of control units
n.T <- 100   # number of treated units

x.C <- rnorm(n.C, mean = 10, sd = 4)
x.T <- rnorm(n.T, mean = 10, sd = 4)

e.C <- rnorm(n.C, mean = 0, sd = 1)
# BUG FIX: treated-group errors were drawn with rnorm(n.C, ...); use n.T so
# the error vector matches x.T in length even when the group sizes differ.
e.T <- rnorm(n.T, mean = 0, sd = 1)

y.C <- 1 + 2 * x.C + e.C

y.Ta <- 1 + 2 * x.T + e.T                # (a) no treatment effect
y.Tb <- 1 + 2 * x.T + e.T + 10           # (b) constant effect of +10
y.Tc <- 1 + 2 * x.T + e.T + .5 * x.T     # (c) effect increasing with x

# (a) Controls as circles, treated as crosses, with group regression lines.
plot(x.C, y.C, pch = 1)
abline(lm(y.C ~ x.C))

points(x.T, y.Ta, pch = 3)
abline(lm(y.Ta ~ x.T))

# (b)
plot(x.C, y.C, pch = 1)
abline(lm(y.C ~ x.C))

points(x.T, y.Tb, pch = 3)
abline(lm(y.Tb ~ x.T))

# (c)
plot(x.C, y.C, pch = 1)
abline(lm(y.C ~ x.C))

points(x.T, y.Tc, pch = 3)
abline(lm(y.Tc ~ x.T))


###########
### 9.9 ###
###########

### a

# Scenario where the raw difference in means misses the treatment effect
# but regressing y on x and T recovers it: the confounder x differs across
# groups (mean 10 vs. 20) while the true effect is a constant +10.
n.C <- 100
n.T <- 100

x.C <- rnorm(n.C, mean=10, sd=4)
x.T1 <- rnorm(n.T, mean=20, sd=4)

e.C <- rnorm(n.C, mean=0, sd=1)
e.T <- rnorm(n.T, mean=0, sd=1)

y.C <- 1 + 2*x.C + e.C
# Same linear response plus a constant treatment effect of +10.
y.T1 <- 1 + 2*x.T1 + e.T + 10

# Stack the two groups into single vectors with a treatment indicator.
y.a <- c(y.C, y.T1)
x.a <- c(x.C, x.T1)
T.a <- rep(c(0,1), c(n.C, n.T))


# Difference in means only (confounded by x).
fit1 <- lm(y.a ~ T.a)
display(fit1)

# Adjusting for x recovers the +10 effect.
fit2 <- lm(y.a ~ T.a + x.a)
display(fit2)

# Controls as circles with their regression line...
plot(x.C, y.C, pch=1, xlim=c(0,40), ylim=c(0,70))
abline(lm(y.C ~ x.C))

# ...treated as crosses with theirs...
points(x.T1, y.T1, pch=3)
abline(lm(y.T1 ~ x.T1))

# ...and horizontal lines at the two group means.
abline(h=mean(y.C))
abline(h=mean(y.T1))


### b

# Scenario where a linear regression gives the wrong estimate but a
# nonlinear (quadratic) regression is correct: the true response is
# quadratic in x. Reuses x.C, x.T1, e.C, e.T, n.C, n.T from part (a).
y.C2 <- 1 + 2*x.C + .2*I(x.C^2) + e.C
# Same quadratic response plus a constant treatment effect of +10.
y.T2 <- 1 + 2*x.T1 + .2*I(x.T1^2) + e.T + 10

# Stack the groups with a treatment indicator.
y.b <- c(y.C2, y.T2)
x.b <- c(x.C, x.T1)
T.b <- rep(c(0,1), c(n.C, n.T))


# Curved data: controls as circles, treated as crosses.
plot(x.C, y.C2, pch=1, xlim=c(0,40), ylim=c(0,200))
points(x.T1, y.T2, pch=3)

# Straight-line fits to the curved data (visibly misspecified).
abline(lm(y.C2 ~ x.C))
abline(lm(y.T2 ~ x.T1))

# Linear adjustment for x: misspecified.
fit3 <- lm(y.b ~ T.b + x.b)
display(fit3)

# Adding a T-by-x interaction still does not capture the curvature.
fit3b <- lm(y.b ~ T.b + x.b + T.b*x.b)
display(fit3b)

# The quadratic term matches the true response and recovers the +10 effect.
fit4 <- lm(y.b ~ T.b + x.b + I(x.b^2))
display(fit4)

Wednesday, December 2, 2009

Homework assignment due mardi, 8 dec

1. Exercise 10.2: Regression discontinuity analysis: suppose you are trying to evaluate the effect of a new procedure for coronary bypass surgery that is supposed to help with the post-operative healing process. The new procedure is risky, however, and is rarely performed in patients who are over 80 years old. Data from this (hypothetical) example are displayed in Figure 10.10.

(a) Does this seem like an appropriate setting in which to implement a regression discontinuity analysis?

(b) The folder bypass contains data for this example: stay is the length of hospital stay after surgery, age is the age of the patient, and new is the indicator variable indicating that the new surgical procedure was used. Pre-operative disease severity (severity) was unobserved by the researchers but we have access to it for illustrative purposes. Can you find any evidence using these data that the regression discontinuity design is inappropriate?

2. Exercise 10.3: Instrumental variables: come up with a hypothetical example in which it would be appropriate to estimate treatment effects using an instrumental variables strategy. For simplicity, stick to an example with a binary instrument and binary treatment variable.

(a) Simulate data for this imaginary example assuming all the assumptions are met. Estimate the local average treatment effect (LATE) for the data by dividing the intent-to-treat effect by the percentage of compliers. Show that two-stage least squares yields the same point estimate.

(b) Now simulate data in which the exclusion restriction is not met (so, for instance, those whose treatment level is left unaffected by the instrument have a treatment effect of half the magnitude of the compliers) but the instrument is strong (say, 80% of the population are compliers) and see how far off your estimate is.

(c) Finally simulate data in which the exclusion restriction is violated in the same way, however, the instrument is weak (only 20% of the population are compliers) and see how far off your estimate is.

Monday, November 30, 2009

Research article to be discussed mardi 8 dec 2009

Can losing lead to winning?, by Jonah Berger and Devin Pope.

Also see discussion here.

Friday, November 27, 2009

Homework assignment due mardi, 1 dec

1. Exercise 9.7: Gain-score models: in the discussion of gain-score models in Section 9.3, we noted that if we include the pre-treatment measure of the outcome in a gain score model, the coefficient on the treatment indicator will be the same as if we had just run a standard regression of the outcome on the treatment indicator and the pre-treatment measure. Show why this is true.

2. Exercise 9.8: Assume that linear regression is appropriate for the regression of an outcome, y, on treatment indicator, T , and a single confounding covariate, x. Sketch hypothetical data (plotting y versus x, with treated and control units indicated by circles and dots, respectively) and regression lines (for treatment and control group) that represent each of the following situations:

(a) No treatment effect,

(b) Constant treatment effect,

(c) Treatment effect increasing with x.

3. Exercise 9.9: Consider a study with an outcome, y, a treatment indicator, T , and a single confounding covariate, x. Draw a scatterplot of treatment and control observations that demonstrates each of the following:

(a) A scenario where the difference in means estimate would not capture the true treatment effect but a regression of y on x and T would yield the correct estimate.

(b) A scenario where a linear regression would yield the wrong estimate but a nonlinear regression would yield the correct estimate.