R version 2.6.2 (2008-02-08)
Copyright (C) 2008 The R Foundation for Statistical Computing
ISBN 3-900051-07-0

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for HTML help.
Type 'q()' to quit R.

CWD = /Users/chl/Notes/022_RMB/chap_3/chapter
> # Regression Methods in Biostatistics, Vittinghoff, Glidden, Shiboski
> # and McCulloch (Springer, 2005)
> #
> # Chapter 3. Basic Statistical Methods
> #
> # Time-stamp: <2008-02-10 13:41:06 chl>
> 
> postscript(file="RMB_c3.ps")
> 
> ##############################################################################
> ### t-test
> 
> # first, we load the data
> HERS <- read.table("hersdata.txt",header=TRUE,sep="\t")
> HERS.women.wht_diab <- subset(HERS,diabetes==0)
> HERS.women.wht_diab$exercise <- factor(HERS.women.wht_diab$exercise,labels=c("No","Yes"))
> 
> # before carrying out the t-test, we take a look at the data with the help
> # of some graphics
> boxplot(glucose~exercise,data=HERS.women.wht_diab)
> title(xlab="Exercise",ylab="Glucose Level (mg/dL)")
> 
> op <- par(mfrow=c(2,2))
> with(HERS.women.wht_diab, hist(glucose[exercise=="No"],
+ main="Exercise: No",xlab="Glucose Level (mg/dL)"))
> with(HERS.women.wht_diab, hist(glucose[exercise=="Yes"],
+ main="Exercise: Yes",xlab="Glucose Level (mg/dL)"))
> with(HERS.women.wht_diab, qqnorm(glucose[exercise=="No"]))
> with(HERS.women.wht_diab, qqline(glucose[exercise=="No"]))
> with(HERS.women.wht_diab, qqnorm(glucose[exercise=="Yes"]))
> with(HERS.women.wht_diab, qqline(glucose[exercise=="Yes"]))
> par(op)
> 
> # Note:
> # the Q-Q plots are not very useful here since the sample size is large
> # enough for the t-test to be valid
> 
> # compute overall and group means
> mean(HERS.women.wht_diab$glucose,na.rm=TRUE)
[1] 96.66043
> group.means <- with(HERS.women.wht_diab, tapply(glucose,exercise,mean,na.rm=TRUE))
> (dobs <- diff(group.means))
      Yes 
-1.692789 
> 
> # now compute the t statistic, assuming equal variances (two-sided test)
> (glucose.ttest <- t.test(glucose~exercise,data=HERS.women.wht_diab,var.equal=TRUE))

	Two Sample t-test

data:  glucose by exercise 
t = 3.8685, df = 2030, p-value = 0.0001130
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 0.8346242 2.5509539 
sample estimates:
 mean in group No mean in group Yes 
         97.36104          95.66825 

> 
> # compare with the Welch-corrected t-test (the default in R)
> t.test(glucose~exercise,data=HERS.women.wht_diab,var.equal=FALSE)

	Welch Two Sample t-test

data:  glucose by exercise 
t = 3.8995, df = 1858.333, p-value = 9.984e-05
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 0.8413953 2.5441828 
sample estimates:
 mean in group No mean in group Yes 
         97.36104          95.66825 

> 
> # given the sample size, one may reasonably rely on the asymptotic
> # convergence of the t distribution to the standard normal
> 1-pnorm(as.numeric(glucose.ttest$statistic))
[1] 5.476017e-05
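> # Note (sketch, not run in this session):
> # since the test is two-sided, the normal approximation to the p-value
> # should use both tails, i.e.
> # 2*(1-pnorm(abs(as.numeric(glucose.ttest$statistic))))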
> 
> ##############################################################################
> ### one-way ANOVA
> HERS$raceth <- factor(HERS$raceth,labels=c("White","Afr Amer","Other"))
> summary(HERS$raceth)
   White Afr Amer    Other 
    2451      218       94 
> 
> # get a numerical summary tabulated by the factor of interest (ethnicity)
> my.summary <- function(x,digits=3) {
+   tmp <- summary(x)
+   tmp <- c(tmp,sd(x),length(x),sum(is.na(x)))
+   names(tmp)[7:9] <- c("Sd","N","NAs")
+   return(round(tmp,digits))
+ }
> with(HERS, by(SBP,raceth,my.summary,digits=2))
INDICES: White
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.      Sd       N     NAs 
  83.00  121.00  133.00  134.80  146.00  224.00   18.83 2451.00    0.00 
------------------------------------------------------------ 
INDICES: Afr Amer
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.      Sd       N     NAs 
  98.00  123.00  137.00  138.20  150.00  194.00   19.99  218.00    0.00 
------------------------------------------------------------ 
INDICES: Other
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.      Sd       N     NAs 
  95.00  119.50  134.00  135.20  148.80  187.00   21.26   94.00    0.00 
> 
> # plot the between-group dispersion around the overall mean
> plot.design(SBP~raceth,data=HERS)
> 
> # perform the ANOVA and display the ANOVA table
> hers.aov <- aov(SBP~raceth,data=HERS)
> summary(hers.aov)
              Df Sum Sq Mean Sq F value Pr(>F)  
raceth         2   2384    1192  3.2981 0.0371 *
Residuals   2760 997618     361                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
> 
> model.tables(hers.aov)
Tables of effects

 raceth 
         White Afr Amer   Other
       -0.2857    3.164  0.1114
rep  2451.0000  218.000 94.0000
> 
> # we can check that there is no clear departure from the ANOVA assumptions
> op <- par(mfrow=c(2,2))
> plot(hers.aov)
> par(op)
> 
> # Notes:
> # 1. some individuals have large residuals, so we should check whether
> #    anything else goes wrong with these observations (916, 2161 and 175)
> #    among the other covariates...
> # 2. pairwise comparisons should follow (see the sketch below)
> 
> # As suggested by the authors, we could perform an ANOVA which does not
> # assume equal variances among groups, as we did for the t-test.
> # B. L. Welch proposed to correct the df in a way that is close to what is
> # done in the two-sample comparison.
> #
> # Welch, B.L. On the comparison of several mean values: An alternative
> # approach, Biometrika 38: 330--336, 1951.
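> # both follow-ups can be sketched as below (not run in this session):
> # pairwise comparisons with Tukey's HSD, and Welch's unequal-variance
> # one-way ANOVA as implemented in oneway.test()
> # TukeyHSD(hers.aov)
> # oneway.test(SBP~raceth,data=HERS)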
> 
> # though it is not really recommended here, we could also perform a
> # non-parametric ANOVA (based on ranks)
> kruskal.test(SBP~raceth,data=HERS)

	Kruskal-Wallis rank sum test

data:  SBP by raceth 
Kruskal-Wallis chi-squared = 4.2182, df = 2, p-value = 0.1213

> 
> plot.design(SBP~raceth,data=HERS,fun=median)
> 
> # Note:
> # the test is now non-significant! (it operates on ranks, hence is less
> # sensitive to the few very large SBP values)
> 
> ##############################################################################
> ### simple linear regression
> 
> # first, we plot the data
> plot(SBP~age,data=HERS,cex=.5,xlab="Age (years)",ylab="Systolic Blood Pressure (mmHg)")
> 
> # and we add the average SBP for each decile of age
> age.decile <- quantile(HERS$age,probs=seq(0,1,.1))
> 
> tmp <- as.numeric(age.decile)
> xx <- numeric(length(tmp)-1)  # mid-interval points
> for (i in 1:(length(tmp)-1)) xx[i] <- (tmp[i]+tmp[i+1])/2
> HERS$ageq <- cut(HERS$age,breaks=age.decile,labels=FALSE)
> points(xx,as.numeric(tapply(HERS$SBP,HERS$ageq,mean)),pch=15,cex=1.2)
> 
> # compute the linear regression
> HERS.lm <- lm(SBP~age,data=HERS)
> summary(HERS.lm)

Call:
lm(formula = SBP ~ age, data = HERS)

Residuals:
    Min      1Q  Median      3Q     Max 
-49.405 -13.150  -1.405  11.378  85.934 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 103.62949    3.59573  28.820   <2e-16 ***
age           0.47173    0.05368   8.787   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 18.77 on 2761 degrees of freedom
Multiple R-squared: 0.02721,	Adjusted R-squared: 0.02685 
F-statistic: 77.21 on 1 and 2761 DF,  p-value: < 2.2e-16 

> 
> # add the regression line to the previous plot
> abline(HERS.lm)
> 
> # some diagnostic plots
> op <- par(mfrow=c(2,2))
> plot(HERS.lm)
> par(op)
> 
> # Note:
> # results differ from those presented by the authors because they consider
> # only a 10% random sample of the total sample (cf. chap3.do on the website)
> 
> # doing the analysis with a random sample (10% of the total size)
> set.seed(12386)
> idx <- sample(HERS$id,size=nrow(HERS)/10)
> HERS.sample10 <- HERS[idx,]
> HERS.lm2 <- lm(SBP~age,data=HERS.sample10)
> summary(HERS.lm2)

Call:
lm(formula = SBP ~ age, data = HERS.sample10)

Residuals:
    Min      1Q  Median      3Q     Max 
-41.238 -12.928  -1.258  11.052  63.242 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 109.8778    11.1384   9.865   <2e-16 ***
age           0.3600     0.1666   2.161   0.0315 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 18.94 on 274 degrees of freedom
Multiple R-squared: 0.01676,	Adjusted R-squared: 0.01318 
F-statistic: 4.672 on 1 and 274 DF,  p-value: 0.03153 

> 
> # we can now predict SBP for a woman of average age
> as.numeric(HERS.lm$coefficients[1] + HERS.lm$coefficients[2]*mean(HERS$age))
[1] 135.0695
> 
> # but we could also create a new data frame containing fitted values for
> # each age, along with the prediction interval (wider than the confidence
> # interval for the mean response)
> HERS.lm.fit <- data.frame(age=seq(min(HERS$age),max(HERS$age),by=1))
> fitted.values <- predict(HERS.lm,HERS.lm.fit,interval="prediction")
> fitted.values2 <- predict(HERS.lm,HERS.lm.fit,interval="confidence")
> HERS.lm.fit <- cbind(HERS.lm.fit,fitted.values,fitted.values2[,2:3])
> colnames(HERS.lm.fit)[5:6] <- c("lwr2","upr2")
> 
> with(HERS.lm.fit, {
+   matplot(age,cbind(lwr,upr),type="l",xlab="Age",ylab="Predicted SBP",col=2,lty=2)
+   matlines(age,cbind(lwr2,upr2),type="l",col=3,lty=2)
+   lines(age,fit)
+   points(mean(HERS$age),mean(HERS$SBP))
+ })
> legend("topleft",c("Prediction","Confidence"),col=2:3,lty=2,title="CI",bty="n")
> 
> # computing a confidence interval for the regression slope using the
> # bootstrap
> library(boot)
> boot.fun <- function(data,x) return(as.numeric(lm(data[x,2]~data[x,1])$coefficients[2]))
> 
> sbp.age <- cbind(HERS$age,HERS$SBP)
> res <- boot(sbp.age,boot.fun,1000)
> res

ORDINARY NONPARAMETRIC BOOTSTRAP

Call:
boot(data = sbp.age, statistic = boot.fun, R = 1000)

Bootstrap Statistics :
     original       bias    std. error
t1* 0.4717281  0.003394972  0.05617798
> 
> # two-sided bootstrap percentile confidence interval for the slope
> quantile(res$t,c(0.025,0.975))
     2.5%     97.5% 
0.3661607 0.5884125 
> 
> # standard error
> sqrt(diag(cov(res$t)))
[1] 0.05617798
> 
> # adjusted bootstrap percentile (BCa) confidence interval (better)
> # Note:
> # the boot.ci() function takes a boot object and generates five different
> # types of two-sided nonparametric confidence intervals: the first-order
> # normal approximation, the basic bootstrap interval, the studentized
> # bootstrap interval, the bootstrap percentile interval, and the adjusted
> # bootstrap percentile (BCa) interval
> boot.ci(res,type=c("norm","perc"))  # normal approx. + percentile method
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 1000 bootstrap replicates

CALL : 
boot.ci(boot.out = res, type = c("norm", "perc"))

Intervals : 
Level      Normal             Percentile     
95%   ( 0.3582,  0.5784 )   ( 0.3644,  0.5899 )  
Calculations and Intervals on Original Scale
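> # the BCa interval itself can be requested as follows (sketch, not run in
> # this session):
> # boot.ci(res,type="bca")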
> 
> # the following displays a graphical summary of the bootstrap procedure
> plot(res)
> 
> # Note:
> # there is a great tutorial on bootstrapping techniques written by John Fox,
> # which is available at the following address:
> # http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-bootstrapping.pdf
> 
> 
> ##############################################################################
> ### two-way contingency table
> 
> # loading the data
> WCGS <- read.table("wcgs.txt",header=TRUE,sep="\t")
> WCGS$chd69 <- factor(WCGS$chd69,labels=c("Noncases","Cases"))
> WCGS$arcus <- factor(WCGS$arcus,labels=c("Unexposed","Exposed"))
> 
> # display a basic two-way table
> chd69.tab <- with(WCGS, table(chd69,arcus))
> chd69.tab
          arcus
chd69      Unexposed Exposed
  Noncases      2058     839
  Cases          153     102
> 
> # interesting summaries are provided by the `vcd` package;
> # however, we are more interested in association measures
> library(vcd)
> assocstats(chd69.tab)
                    X^2 df   P(> X^2)
Likelihood Ratio 12.984  1 0.00031411
Pearson          13.638  1 0.00022163

Phi-Coefficient   : 0.066 
Contingency Coeff.: 0.066 
Cramer's V        : 0.066 
> 
> # here are the functions we need (but see the `epi` and `epitools` packages)
> odds.ratio <- function(x,alpha=.05) {
+   if (!is.table(x)) x <- as.table(x)
+   pow <- function(x,a=-1) x^a
+   or <- (x[1,1]*x[2,2]) / (x[1,2]*x[2,1])
+   or.se <- sqrt(sum(pow(x)))
+   or.ci <- exp(log(or) + c(-1,1)*qnorm(1-alpha/2)*or.se)
+   return(list(OR=or,SE=or.se,CI=or.ci))
+ }
> 
> # Note:
> # we don't use the Cornfield approximation when computing the confidence
> # interval (the Cornfield approximation in fact relates to the SE
> # calculation).
> #
> # See http://www.stata.com/help.cgi?cci for the Stata on-line help,
> #
> # and the 'official' reference for CI computation with matched-case
> # designs:
> # J J Schlesselman. Case-Control Studies. Design, Conduct, Analysis.
> # Oxford University Press, 1982.
> #
> # Also see
> # C-Y Lin & M-C Yang, Improved Exact Confidence Intervals for the
> # Odds Ratio in Two Independent Binomial Samples, Biometrical Journal,
> # 48(6): 1008-1019, 2006.
> #
> # C C Brown, The Validity of Approximation Methods for Interval
> # Estimation of the Odds Ratio, American Journal of Epidemiology,
> # 113(4): 474-480, 1981.
> #
> 
> relative.risk <- function(x,srow=1,scol=1) {
+   # we assume that Cases/Exposed is contained in cell [srow,scol]
+   out <- c(as.numeric(x[srow,scol]/apply(x,2,sum)[scol]),
+            as.numeric(x[srow,scol+1]/apply(x,2,sum)[scol+1]))
+   names(out) <- c("Cases+","Cases-")
+   return(out)
+ }
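> # Note (sketch, not run in this session):
> # with chd69.tab as built above, the cases sit in row 2 and the columns
> # are ordered (Unexposed, Exposed), so the defaults srow=1, scol=1 do not
> # match the assumption stated inside the function; reordering the columns
> # and pointing at the Cases row would give the intended risks:
> # relative.risk(chd69.tab[,2:1],srow=2)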
> 
> # The following functions are adapted from
> # S. BOLBOACA and A. ACHIMAS CADARIU. Binomial Distribution Sample
> # Confidence Intervals Estimation. 6. Excess Risk
> # http://lejpt.academicdirect.org/A04/01_21.htm
> #
> # Here, we express the excess risk as
> #   ER = a/(a+b) - c/(c+d) = y/n - x/m
> # for the classical 2x2 table
> #
> #         |  +  |  -  |
> #   -------------------
> #   |  +  |  a  |  b  |
> #   -------------------
> #   |  -  |  c  |  d  |
> #   -------------------
> #
> 
> dwald <- function(x,m,y,n,z=1.96) {
+   lwr <- max(c(y/n-x/m-z*sqrt((x*(m-x))/m^3+(y*(n-y))/n^3),-1))
+   upr <- min(c(y/n-x/m+z*sqrt((x*(m-x))/m^3+(y*(n-y))/n^3),1))
+   return(c(lwr,upr))
+ }
> 
> dac <- function(x,m,y,n,z=1.96) {
+   return(dwald(x+z^2/(4*sqrt(2)),m+z^2/(2*sqrt(2)),y+z^2/(4*sqrt(2)),n+z^2/(4*sqrt(2)),z))
+ }
> 
> # odds ratio
> odds.ratio(chd69.tab)
$OR
[1] 1.63528

$SE
[1] 0.1342299

$CI
[1] 1.257000 2.127399

> 
> # relative risk
> chd69.rr <- relative.risk(chd69.tab)
> excess.risk <- as.numeric(abs(diff(chd69.rr)))
> dac(chd69.tab[1,2],chd69.tab[1,1]+chd69.tab[1,2],chd69.tab[2,2],chd69.tab[2,1]+chd69.tab[2,2])
[1] 0.04956725 0.17420299
> # Note:
> # this interval looks off because the arguments passed above do not match
> # the x/m and y/n of the formula: the risks should be computed within the
> # exposure groups (x=153, m=2211, y=102, n=941), not within the outcome
> # rows, which would give a risk difference of about 0.039
> risk.ratio <- as.numeric(chd69.rr[1]/chd69.rr[2])
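> # a quick cross-check (sketch, not run in this session): prop.test() gives
> # a confidence interval for the risk difference directly
> # prop.test(c(102,153),c(102+839,153+2058))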
> 
> # global association test
> # we don't need the continuity correction when using the chi-squared test
> chisq.test(chd69.tab,correct=F)

	Pearson's Chi-squared test

data:  chd69.tab 
X-squared = 13.6382, df = 1, p-value = 0.0002216

> 
> # the `epitools` package offers facilities for computing the above
> # estimates, together with a test of association
> # Note:
> # CIs for the odds ratio are estimated using the Wald approximation;
> # we can add the option verbose=TRUE for a more detailed output
> library(epitools)
> epitab(chd69.tab,pvalue="chi2")
$tab
          arcus
chd69      Unexposed         p0 Exposed        p1 oddsratio    lower    upper
  Noncases      2058 0.93080054     839 0.8916047   1.00000       NA       NA
  Cases          153 0.06919946     102 0.1083953   1.63528 1.257000 2.127399
          arcus
chd69           p.value
  Noncases           NA
  Cases      0.0002216321

$measure
[1] "wald"

$conf.level
[1] 0.95

$pvalue
[1] "chi2"

> 
> # the HIV Status by AIDS Diagnosis of Male Partner data
> HIV <- matrix(c(3,2,4,22),nrow=2)
> dimnames(HIV) <- list(c("Cases","Noncases"),c("Exposed","Unexposed"))
> HIV
         Exposed Unexposed
Cases          3         4
Noncases       2        22
> 
> epitab(HIV)
$tab
         Exposed  p0 Unexposed        p1 oddsratio    lower    upper    p.value
Cases          3 0.6         4 0.1538462      1.00       NA       NA         NA
Noncases       2 0.4        22 0.8461538      8.25 1.028252 66.19241 0.06192065

$measure
[1] "wald"

$conf.level
[1] 0.95

$pvalue
[1] "fisher.exact"

> 
> # compare the OR estimate with that obtained using the mid-p method
> # Kenneth J. Rothman and Sander Greenland (1998), Modern Epidemiology,
> # Lippincott-Raven Publishers (p. 251)
> or.midp(c(3,4,2,22))
$x
     [,1] [,2]
[1,]    3    4
[2,]    2   22

$estimate
[1] 7.332308

$conf.int
[1]  0.868172 80.674513

$conf.level
[1] 0.95

attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"

> 
> # alternatively, we can directly make use of Fisher's exact test
> fisher.test(HIV)

	Fisher's Exact Test for Count Data

data:  HIV 
p-value = 0.06192
alternative hypothesis: true odds ratio is not equal to 1 
95 percent confidence interval:
   0.6484488 117.9447821 
sample estimates:
odds ratio 
   7.48285 

> fisher.test(HIV,alternative="greater")

	Fisher's Exact Test for Count Data

data:  HIV 
p-value = 0.06192
alternative hypothesis: true odds ratio is greater than 1 
95 percent confidence interval:
 0.8981707       Inf 
sample estimates:
odds ratio 
   7.48285 

> 
> # now, if we use the `epicalc` package, we get all of this in one go
> library(epicalc)
> HIV.tab <- make2x2(3,2,4,22)
> # csi(3,2,4,22)
> cc(cctable=HIV.tab)
              Exposure
Outcome        Non-exposed Exposed Total
  Non-diseased          22       2    24
  Diseased               4       3     7
  Total                 26       5    31

OR =  7.48 
95% CI =  0.65  117.94  
Chi-squared = 4.77 , 1 d.f. , P value = 0.029 
Fisher's exact test (2-sided) P value = 0.062 
One of more cells is/are less than 5, not appropriate for graphing
> cs(cctable=HIV.tab)
              Exposure
Outcome        Non-exposed Exposed Total
  Non-diseased          22       2    24
  Diseased               4       3     7
  Total                 26       5    31

          Rne  Re   Rt
Risk     0.15 0.6 0.23

                                      Estimate Lower95ci Upper95ci
Risk difference (attributable risk)       0.45      0.04      0.85
Risk ratio                                3.90      0.74     20.63
Attr. frac. exp. -- (Re-Rne)/Re           0.74
Attr. frac. pop. -- (Rt-Rne)/Rt*100 %    31.87

> 
> # is the OR constant across subgroups of age? (Mantel-Haenszel test)
> # WCGS$agec is a 5-level factor
> 
> # first we look at a simple two-way classification
> (chd69.tab2 <- with(WCGS, table(chd69,agec)))
          agec
chd69         0    1    2    3    4
  Noncases  512 1036  680  463  206
  Cases      31   55   70   65   36
> # column profiles (outcome distribution within each age class)
> apply(chd69.tab2,1,function(x) round(x/apply(chd69.tab2,2,sum)*100,2))
   chd69
    Noncases Cases
  0    94.29  5.71
  1    94.96  5.04
  2    90.67  9.33
  3    87.69 12.31
  4    85.12 14.88
> 
> # global association test with the chi-squared statistic
> chisq.test(chd69.tab2)

	Pearson's Chi-squared test

data:  chd69.tab2 
X-squared = 46.6534, df = 4, p-value = 1.801e-09

> 
> # a more detailed inspection of the association estimates can be obtained
> # using
> # cc(cctable=chd69.tab2)
> 
> # Note:
> # the score test for trend in odds has not been computed yet (see the
> # sketch below)
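> # a related check (sketch, not run in this session): a chi-squared test
> # for trend in proportions, with the number of cases and the column totals
> # per age class
> # prop.trend.test(chd69.tab2[2,],apply(chd69.tab2,2,sum))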
> 
> # Twenty-year Vital Status by Smoking Behavior, Stratified by Age, is not
> # available as raw data, so we will use the WCGS study instead
> chd69.tab3 <- xtabs(~chd69+arcus+agec,data=WCGS)
> 
> # we can get the OR by stratum as follows
> apply(chd69.tab3,3,function(x) (x[1,1]*x[2,2])/(x[1,2]*x[2,1]))
        0         1         2         3         4 
1.5482625 1.8703704 2.0288066 0.8463815 0.9252336 
> 
> # run the Mantel-Haenszel test and a test of homogeneity of the OR
> mhor(mhtable=chd69.tab3)

Stratified analysis by agec 
                 OR lower lim. upper lim. P value
agec 0        1.547      0.606       3.64  0.2672
agec 1        1.869      0.993       3.42  0.0454
agec 2        2.027      1.195       3.43  0.0073
agec 3        0.847      0.465       1.51  0.5849
agec 4        0.926      0.427       2.00  0.8582
M-H combined  1.380      1.056       1.80  0.0164

M-H Chi2(1) = 5.76 , P value = 0.016 
Homogeneity test, chi-squared 4 d.f. = 7.72 , P value = 0.102 

> 
> # alternatively, one can use the following (without the continuity
> # correction, to get a similar result)
> with(WCGS, mantelhaen.test(chd69,arcus,agec,correct=FALSE))

	Mantel-Haenszel chi-squared test without continuity correction

data:  chd69 and arcus and agec 
Mantel-Haenszel X-squared = 5.756, df = 1, p-value = 0.01643
alternative hypothesis: true common odds ratio is not equal to 1 
95 percent confidence interval:
 1.056304 1.802479 
sample estimates:
common odds ratio 
         1.379842 

> 
> 
> ##############################################################################
> ### survival data
> 
> LEUK <- read.table("leuk.txt",header=TRUE)
> 
> library(survival)
> # create a survival object
> LEUK.surv <- Surv(LEUK$time,LEUK$cens)
> 
> # Kaplan-Meier estimates
> LEUK.km <- survfit(LEUK.surv~group,data=LEUK)
> plot(LEUK.km[1])
> lines(LEUK.km[2],col="gray60")
> legend("topright",c("6-MP","Placebo"),lty=1,col=c("black","gray60"),bty="n")
> title(xlab="Weeks since randomization",ylab="Proportion in remission")
> 
> # numerical summary
> LEUK.km
Call: survfit(formula = LEUK.surv ~ group, data = LEUK)

         n events median 0.95LCL 0.95UCL
group=1 21      9     23      16     Inf
group=2 21     21      8       4      12
> summary(LEUK.km)
Call: survfit(formula = LEUK.surv ~ group, data = LEUK)

                group=1 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    6     21       3    0.857  0.0764        0.720        1.000
    7     17       1    0.807  0.0869        0.653        0.996
   10     15       1    0.753  0.0963        0.586        0.968
   13     12       1    0.690  0.1068        0.510        0.935
   16     11       1    0.627  0.1141        0.439        0.896
   22      7       1    0.538  0.1282        0.337        0.858
   23      6       1    0.448  0.1346        0.249        0.807

                group=2 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    1     21       2   0.9048  0.0641      0.78754        1.000
    2     19       2   0.8095  0.0857      0.65785        0.996
    3     17       1   0.7619  0.0929      0.59988        0.968
    4     16       2   0.6667  0.1029      0.49268        0.902
    5     14       2   0.5714  0.1080      0.39455        0.828
    8     12       4   0.3810  0.1060      0.22085        0.657
   11      8       2   0.2857  0.0986      0.14529        0.562
   12      6       2   0.1905  0.0857      0.07887        0.460
   15      4       1   0.1429  0.0764      0.05011        0.407
   17      3       1   0.0952  0.0641      0.02549        0.356
   22      2       1   0.0476  0.0465      0.00703        0.322
   23      1       1   0.0000      NA           NA           NA

> 
> # comparing the two groups with the log-rank test using survdiff();
> # rho=0 (the default) gives the log-rank test, while rho=1 gives the
> # Wilcoxon test (see the sketch at the end)
> survdiff(LEUK.surv~group,data=LEUK)
Call:
survdiff(formula = LEUK.surv ~ group, data = LEUK)

         N Observed Expected (O-E)^2/E (O-E)^2/V
group=1 21        9     19.3      5.46      16.8
group=2 21       21     10.7      9.77      16.8

 Chisq= 16.8  on 1 degrees of freedom, p= 4.17e-05 
> 
> dev.off()  # release the postscript device
null device 
          1 
> 
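> # a sketch of the Wilcoxon variant of survdiff() mentioned above (not run
> # in this session):
> # survdiff(LEUK.surv~group,data=LEUK,rho=1)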