# Regression Methods in Biostatistics, Vittinghoff, Glidden, Shiboski # and McCulloch (Springer, 2005) # # Chapter 6. Logistic Regression # # Time-stamp: <2008-08-26 20:46:21 chl> postscript(file="RMB_c6.ps") ############################################################################## # WCGS Sudy: Relationship between CHD and Age # loading the data WCGS <- read.table("wcgs.txt",header=TRUE,sep="\t") WCGS$chd69 <- factor(WCGS$chd69,labels=c("Noncases","Cases")) # plot the data stripchart(age~as.numeric(chd69),data=WCGS,method="jitter",pch="+", group.names=levels(WCGS$chd69),las=1,cex.axis=.8,cex=.8) title("WCGS Study: Coronary disease and Age") # set up the logistic model with default logit link WCGS.glm <- glm(chd69~age,data=WCGS,family=binomial()) summary(WCGS.glm) logLik(WCGS.glm) # the log-likelihood exp(coef(WCGS.glm)[2]*10) # OR associated with a ten-year increase in age library(MASS) confint(WCGS.glm) # 95% CI using profiled likelihood confint.default(WCGS.glm) # 95% CI based assuming asymptotic normality WCGS.glm.fit <- fitted(WCGS.glm) # fitted values # estimate the probability of having CHD for age=55 exp(-5.940+0.074*55)/(1+exp(-5.940+0.074*55)) # or, alternatively pred.df <- data.frame(age=c(55)) predict(WCGS.glm,pred.df,type="response",se=TRUE) # we could also do the computation the "hard way", but see # J Faraway, Extending the Linear Model with R. Generalized Linear, # Mixed Effects and Nonparametric Regression Models, Chapman & Hall/CRC, # 2006 # # to predict the reponse at age 55 x0 <- c(1,55) eta0 <- sum(x0*coef(WCGS.glm)) library(boot) inv.logit(eta0) # to get the 95% CI, we first need to extract the variance-covariance matrix (vcm <- summary(WCGS.glm)$cov.unscaled) se <- sqrt(t(x0) %*% vcm %*% x0) # SE on the logit scale inv.logit(eta0+c(-1,1)*qnorm(0.975)*se) # CI on the probability scale # R does not provide LR test in the summary output, but it could easily be # obtained from the Deviance Table, e.g. anova(WCGS.glm,test="Chisq") # See also lrm() in the Design package library(Design) lrm(chd69~age,data=WCGS) # for model adequacy checking, see residuals.lrm # e.g. resid(lrm(chd69~age,data=WCGS,x=T,y=T), "partial", pl="loess") # # Note: # The definitive reference for the Design package is # F. Harrell, Regression Modeling Strategies, Springer, 2001. # See Chapter 10 to 12 on binary logistic regression. # # assessing linearity in the relationship between CHD Risk and Age (p. 191) tab.chd69.age <- with(WCGS, table(chd69,age)) obs.logit <- logit(tab.chd69.age[2,]/apply(tab.chd69.age,2,sum)) plot(obs.logit~as.numeric(names(obs.logit)),pch=19,cex=.8,xlab="Age (years)", ylab="Logit CHD Risk",ylim=c(-4,-1)) abline(lm(obs.logit~as.numeric(names(obs.logit))),lty=2) lines(as.numeric(names(obs.logit)), predict(loess(obs.logit~as.numeric(names(obs.logit)))),lty=3) legend("bottom",c("Linear fit","LOWESS smooth"),lty=2:3,horiz=TRUE) # this suggests to test a model including a quadratic effect for age, # after centering it to reduce colinearity with the linear term summary(glm(chd69~age+I(scale(age,scale=F)^2),data=WCGS,family=binomial)) ############################################################################## # WCGS Study: CHD and Arcus Senilis (categorical predictor) WCGS$arcus <- factor(WCGS$arcus,labels=c("Unexposed","Exposed")) # we graphically display the Morbidity/Exposition table mosaicplot(t(table(WCGS$chd69,WCGS$arcus)),col=c(4,2),main="WCGS Study") WCGS.glm2 <- glm(chd69~arcus,data=WCGS,family=binomial()) summary(WCGS.glm2) # estimate OR and associated 95% CI library(epitools) oddsratio(table(WCGS$chd69,WCGS$arcus),method="wald") # Wald CI # Note that we could also use the epitab() function # estimate a model where predictor has multiple categories table(WCGS$chd69,WCGS$agec) WCGS.glm3 <- glm(chd69~as.factor(agec),data=WCGS,family=binomial()) summary(WCGS.glm3) # estimate OR, together with their SD and associated 95% CI WCGS.glm3.OR <- as.numeric(exp(coef(WCGS.glm3))[2:5]) # omit the intercept res <- matrix(NA,nr=4,nc=3) for (i in 2:5) { # we take the first level as the baseline test <- oddsratio(table(WCGS$chd69,WCGS$agec)[,c(1,i)],method="wald") res[i-1,] <- test$measure[2,] } colnames(res) <- c("OR","LB-CI","UB-CI") rownames(res) <- paste("agec",1:4,sep="_") print(res,digits=4) # suppose, we want to change reference level for age agec2 <- relevel(as.factor(WCGS$agec),ref="3") table(agec2) oddsratio(table(WCGS$chd69,agec2)[,c(1,5)],method="wald") # OR agec_4/agec_3 ############################################################################## # Multipredictor Model WCGS.glm4 <- glm(chd69~age+chol+sbp+bmi+smoke,data=WCGS,family=binomial()) summary(WCGS.glm4) # we plot the 95% CI for each coefficient # rather than using the above method with Wald CI, we use confint # (it works because each variable is either continuous or binary) WCGS.glm4.coeff <- coef(WCGS.glm4) WCGS.glm4.CI <- confint(WCGS.glm4) opar <- par(mar=c(5,6,4,2),las=1) plot(exp(WCGS.glm4.coeff)[2:6],1:5,xlim=c(0.5,2.5),axes=FALSE,pch=19, cex=1.2,ylab="",xlab="OR",main="WCGS Study") axis(1) axis(2,at=1:5,labels=names(WCGS.glm4.coeff)[2:6],tick=FALSE) segments(exp(WCGS.glm4.CI[2:6,1]),1:5,exp(WCGS.glm4.CI[2:6,2]),1:5,lwd=2) abline(v=1,lty=2,col="lightgray") par(opar) # now we scale the predictors # Note that age_c is a centered predictor scaled by a factor of 10 plot(WCGS$age_10,scale(WCGS$age,scale=F)/10) WCGS.glm5 <- glm(chd69~age_10+chol_50+sbp_50+bmi_10+smoke,data=WCGS, family=binomial()) summary(WCGS.glm5) # next, we add a 4-levels predictor table(WCGS$behpat) WCGS.glm6 <- update(WCGS.glm5,.~.+as.factor(behpat)) summary(WCGS.glm6) exp(confint.default(WCGS.glm6)) # approximate normal-based 95% CI # the reduced model without behpat is compared to the above one anova(WCGS.glm6,WCGS.glm5,test="Chisq") # or, by hand 2*(logLik(WCGS.glm6)-logLik(WCGS.glm5)) # or, using lrtest from the Design package lrtest(WCGS.glm6,WCGS.glm5) # The lmtest package also contains a function called lrtest detach(package:Design) library(lmtest) lrtest(WCGS.glm6,WCGS.glm5) # or, equivalently # lrtest(WCGS.glm6,"behpat") # after replacing the 4-levels behpat by a binary version (dipbat), we get the # following results WCGS.glm7 <- glm(chd69~age_10+chol_50+sbp_50+bmi_10+smoke+dibpat,data=WCGS, family=binomial()) summary(WCGS.glm7) exp(confint.default(WCGS.glm7)) # again, wa compare the two nested models anova(WCGS.glm7,WCGS.glm6,test="Chisq") ############################################################################## # WCGS Study: Confounding effect (Type A Behavior pattern) WCGS.glm8 <- glm(dibpat~age_10+chol_50+sbp_50+bmi_10+smoke,data=WCGS, family=binomial()) summary(WCGS.glm8) # adjusted ORs, omitting the intercept exp(coef(WCGS.glm8)[-1]) # unadjusted OR for dipbat/chd69 oddsratio(table(WCGS$chd69,WCGS$dibpat)) # while in the 7th model, it was exp(coef(WCGS.glm7)[["dibpat"]]) ## # we drop one at a time each predictor in the 7th model and estimate the ## # change in OR for dipbat/chd69 ## mod.pred <- "age_10+chol_50+sbp_50+bmi_10+smoke+dibpat" ## res <- numeric(lenth(pred.var)) ## for (i in 1:5) { ## reduced.model <- update(WCGS.glm7,.~.-unlist(strsplit(mod.pred,"+",fixed=T))[i]) ## res[i] <- exp(coef(reduced.model)[["dibpat"]]) ## } ############################################################################## # WCGS Study: Interaction effect (age under/over 50 vs. arcus senilis) with(WCGS, table(bage_50,arcus)) xtabs(~as.numeric(chd69)+as.numeric(bage_50)+as.numeric(arcus),data=WCGS) summary(xtabs(~as.numeric(chd69)+as.numeric(bage_50)+as.numeric(arcus),data=WCGS)) WCGS.glm9 <- glm(chd69~bage_50*arcus,data=WCGS,family=binomial()) summary(WCGS.glm9) exp(coef(WCGS.glm9)) # OR for coefficient bage_50 is the OR for age 50-59 vs. # age 39-49, for people without arcus # estimate OR for Arcus-Age interaction # (we use the contrast() function in the Design package) library(Design) WCGS$bage_50 <- as.factor(WCGS$bage_50) WCGS.glm9bis <- lrm(chd69~bage_50*arcus,data=WCGS) cc <- contrast(WCGS.glm9bis,list(bage_50='1',arcus='Exposed'), list(bage_50='0',arcus='Exposed')) # finally, we exponentiate the contrast and its 95% CI to work on the # probability scale exp(unlist(cc[2:5])) # we may also compared the full model with a model without interaction WCGS.glm10 <- update(WCGS.glm9,.~.-bage_50:arcus) anova(WCGS.glm10,WCGS.glm9,test="Chi") # if we now consider age as a continuous variable WCGS.glm11 <- glm(chd69~age*arcus,data=WCGS,family=binomial()) summary(WCGS.glm11) # estimate the log odds associated with having arcus for age=55 pred.df2 <- expand.grid(age=seq(min(WCGS$age),max(WCGS$age)), arcus=factor(levels(WCGS$arcus))) WCGS.glm11.pred <- predict(WCGS.glm11,pred.df2,se=TRUE) # as we expressed the predicted value on the link scale, we don't have to # take the log of the predicted values before substracting them OR.11 <- WCGS.glm11.pred$fit[pred.df2$age==55 & pred.df2$arcus=="Exposed"] - WCGS.glm11.pred$fit[pred.df2$age==55 & pred.df2$arcus=="Unexposed"] # thus, OR equals exp(OR.11) # we can also do this using the above contrast() function WCGS.glm11bis <- lrm(chd69~age*arcus,data=WCGS) cc2 <- contrast(WCGS.glm11bis,list(age=55,arcus='Exposed'), list(age=55,arcus='Unexposed')) exp(unlist(cc2[2:5])) # plot log odds as a function of age, separately for individuals with # and without arcus interaction.plot(pred.df2$age,pred.df2$arcus,WCGS.glm11.pred$fit,legend=FALSE, xlab="Age (years)",ylab="Log CHD Risk") legend("bottom",c("Unexposed","Exposed"),lty=1:2,horiz=TRUE) # compare the p-value for Age x Arcus with that obtained from an additive # model (i.e. where we model log(P) against a linear combination of predictors) WCGS.glm12 <- glm((as.numeric(chd69)-1)~bage_50*arcus,data=WCGS,family=poisson) summary(WCGS.glm12) # however, I cannot get the Wald test p-value of 0.15 as Vittinghoff et al. # here, it is estimated as 0.032 ############################################################################## # Prediction using an extended logistic model for CHD Events WCGS.glm13 <- glm(chd69~age_10+chol_50+sbp_50+bmi_10+smoke+dibpat +bmi_10:chol_50+bmi_10:sbp_50, data=WCGS,family=binomial()) summary(WCGS.glm13) # Note that we get slighlty different estimates # predictions for the observed sample (5 individuals taken at random among the # wole dataset) set.seed(143) five.indiv <- sample(1:nrow(WCGS),5,replace=F) WCGS.glm13.pred <- predict(WCGS.glm13,type="response")[five.indiv] WCGS.five.indiv <- WCGS[five.indiv,c("chd69","age","chol","sbp","bmi","smoke","dibpat")] cbind(WCGS.five.indiv,WCGS.glm13.pred) # compute AUC and plot the AUC library(Epi) WCGS.glm13.predAll <- predict(WCGS.glm13,type="response") tmp <- cbind(predicted=WCGS.glm13.predAll,observed=WCGS$chd69[as.numeric(names(WCGS.glm13.predAll))]) ROC(tmp[,"predicted"],tmp[,"observed"]-1,plot="ROC") # diagnostic plots (as plotted pp. 189-191) # plot standardized residuals against obs. number plot(residuals(WCGS.glm13,type="pearson"),ylab="Standardized Pearson Residuals", xlab="Observation Number",pch=19,cex=.8) # dfbetas ifl.glm13 <- influence(WCGS.glm13,do.coef=FALSE) dfbetas.glm13 <- dfbetas(WCGS.glm13) # plot of the residuals vs. fitted values plot(residuals(WCGS.glm13,type="response")~WCGS.glm13.predAll,pch=19,cex=.8, xlab="Fitted values",ylab="Response Residuals") # Leverage effect hi <- influence(WCGS.glm13)$hat plot(hi~WCGS.glm13.predAll,xlab="Fitted values", ylab=expression(paste("Leverage (",h[i],")")),pch=19,cex=.8) hh <- which(hi>0.06) text(WCGS.glm13.predAll[hh],hi[hh],hh,cex=.8,pos=4) ############################################################################## # Case-Control Studies: The Ille-et-Villaine study of esopahgeal cancer # see e.g. http://www.ncbi.nlm.nih.gov/pubmed/861389 # # Note: # Data are not available on companion website. Fortunately, the are included # in the esoph R dataset. data(esoph) # recode tobacco consumption as binary variable esoph$ditob <- ifelse(as.numeric(esoph$tobgp)==1,"O-9g/day","10+ g/day") esoph$ditob <- factor(esoph$ditob,levels=c("O-9g/day","10+ g/day"),ordered=TRUE) with(esoph, tapply(ncases,ditob,sum)) # compute odds ratio for smoking and esophageal cancer # we first need to recode the case/control classification in the 'long' format # before using epitab() from epitools package # TODO... # runnig an GLM on this dataset (here age is categorical) esoph.glm <- glm(cbind(ncases,ncontrols)~alcgp+ditob+agegp, data=esoph,family=binomial) summary(esoph.glm) # # Note: # Vittinghoff et al. take age as a continuous variable (in years), but we # only have a categorical variable # To approximate their results, we convert agegp to a numerical variable, # after taking the mid-group value esoph$age.num <- strsplit(as.character(esoph$agegp),"-") for (i in 1:77) esoph$age.num[[i]] <- (as.numeric(esoph$age.num[[i]][2]) +as.numeric(esoph$age.num[[i]][1]))/2 esoph$age.num <- unlist(esoph$age.num) esoph$age.num[78:88] <- "75" esoph$age.num <- as.numeric(esoph$age.num) esoph.glm2 <- glm(cbind(ncases,ncontrols)~alcgp+ditob+age.num, data=esoph,family=binomial) summary(esoph.glm2) # Using this coding, we clearly don't the same results dev.off() # release the postscript device