Sunday, October 18, 2009

Some R code for conference #4

### conf #4

library("arm")
library ("foreign")

### a
# do not convert categorical variables into factors in order to have numerical variables for the analysis
Nes <- read.dta("nes5200_processed_voters_realideo.dta", convert.factors=F)

# same but conversion into factors in order to have the labels
Nes.fact <- read.dta("nes5200_processed_voters_realideo.dta")
str(Nes)
str(Nes.fact)

summary(Nes.fact)
#labels(Nes)["educ1"]

ok <- !is.na(Nes$age + Nes$female + Nes$black + Nes$educ1 + Nes$income + Nes$partyid7 + Nes$presvote + Nes$real_ideo) & Nes$year==1992 & Nes$presvote<3
Nes.red <- Nes[ok, c("age", "female", "black", "educ1", "income", "partyid7", "presvote", "real_ideo")]
summary(Nes.red)
summary(Nes.red$presvote[Nes.red$black==1])
table(Nes.red$income[Nes.red$presvote==1])
table(Nes.red$income[Nes.red$presvote==2])
table(Nes.red$presvote, Nes.red$partyid7)

Nes.red$presvote <- Nes.red$presvote -1

logit1 <- glm(presvote ~ income, family=binomial(link="logit"), data=Nes.red)
display(logit1)


logit2 <- glm(presvote ~ income + age + female + black + educ1 + partyid7 + real_ideo, family=binomial(link="logit"), data=Nes.red)
display(logit2, digits=4)

# drop partyid and female
# interact black, income and real_ideo with other variables
logit3 <- glm(presvote ~ age + black + educ1 + partyid7 + real_ideo +
black:educ1 + partyid7:educ1 + real_ideo:educ1 +
black:partyid7 + black:real_ideo + partyid7:real_ideo
, family=binomial(link="logit"), data=Nes.red)
display(logit3, digits=4)

logit4 <- glm(presvote ~ age + black + income + real_ideo + income:real_ideo
, family=binomial(link="logit"), data=Nes.red)
display(logit4, digits=4)

# standardize and refit to know the respective magnitudes

# fitted values -- 2 ways (at least)
# invlogit(predict(logit1))-logit1$fitted.values

# residual plot
resid <- logit4$y - logit4$fitted.values
plot(logit4$fitted.values, logit4$y - logit4$fitted.values)


# binned residual plot
# 40 bins of equal size
resid.quant <- quantile(logit4$fitted.values, probs=seq(0,1,length=41))
# is.vector(bin.resid)
length(resid.quant)
resid.quant
mean.bin <- rep(NA, 40)
for (k in 1:40){
mean.bin[k] <- mean(resid[logit4$fitted.values>=resid.quant[k] &logit4$fitted.values<=resid.quant[k+1] ])
}
plot(resid.quant[-1], mean.bin, ylim=c(-.3,.3))

# var.bin <- rep(NA, 40)
# for (k in 1:40){
# var.bin[k] <- var(logit4$fitted.values[logit4$fitted.values>=resid.quant[k] &logit4$fitted.values<=resid.quant[k+1] ])
# }

var.bin <- rep(NA, 40)
for (k in 1:40){
var.bin[k] <- mean((logit4$fitted.values[logit4$fitted.values>=resid.quant[k] &logit4$fitted.values<=resid.quant[k+1] ])*(1-logit4$fitted.values[logit4$fitted.values>=resid.quant[k] &logit4$fitted.values<=resid.quant[k+1] ]))
}


# fitted.sort <- sort(logit4$fitted.values)
# lines(fitted.sort, 2*sqrt(fitted.sort*(1-fitted.sort)/948*40))

lines(resid.quant[-1], 2*sqrt(var.bin/948*40))
lines(resid.quant[-1], -2*sqrt(var.bin/948*40))

No comments:

Post a Comment