\documentclass[11pt]{article}
\newcommand{\s}{\textsf{S}}
\begin{document}
\tableofcontents
\vspace*{3em}
%\newpage
\section{S-language}
\emph{Problem set I}
\begin{enumerate}
\item Write and test a command that creates absolute values; other
than \texttt{abs()}.
\item Write and test a command that evaluates the standard error of
the mean associated with a vector of $n$ values labeled $x$ without
using \texttt{var(x)} where
\[
\textrm{standard error}=\sqrt{\frac{\sum(x_i-\bar{x})^2}{n(n-1)}}.
\]
\item Write and test a set of commands that calculates the square root
of $T$ using the relationship $x_{i+1}=(x_i+T/x_i)/2$.
\item Compare the \s-function \texttt{pnorm()} to the approximation
$P(x)$ where
\[
P(x)=\frac{1+\sqrt{1-e^{-2x^2/\pi}}}{2}
\]
when $x$ is between 0 and $\infty$.
Find the maximum difference between these two functions and the
value at which the maximum difference occurs.
\item Pascal's triangle is
\begin{center}
\begin{tabular}{cc}
row 1 & 1 \\
row 2 & 1 1 \\
row 3 & 1 2 1 \\
row 4 & 1 3 3 1 \\
row 5 & 1 4 6 4 1 \\
& ... \\
\end{tabular}
\end{center}
Write and test an \s-function that generates the $k$th row. Verify
that the rows sum to $2^k$.
\item The estimated standard deviation is
\[
S=\sqrt{\frac{\sum(x_i-\bar{x})^2}{n-1}}.
\]
A correction for the bias of this estimate is
\[
\alpha_n=\sqrt{\frac{2}{n}}\frac{\Gamma(n/2)}{\Gamma([n-1]/2)}
\]
where $\Gamma(x)$ represents a gamma function evaluated at
$x$. Calculate $\alpha_n$ for $n=2,3,4,\dots,50$.
Show that
\[
\alpha_n=\frac{3.5n-3.62}{3.5n-1}
\]
is a better approximation.
\item Write and test \s-commands that create the following two patterns
of numbers:
\[\begin{array}{l}
1 2 3 4 5 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5 \\
1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4 5 5 5 5 5
\end{array}
\]
Write and test \s-commands that will generate this pattern in
general; that is, for any chosen integer.
\item Create an \s-vector with 1000 values set to 1. Then add one to
every second value, then add one to every third value, then add one
to every fourth value, and so on 1000 times. Which values are odd
and which are even? Justify the observed pattern.
\item Write and test an \s-function that accepts two vectors of
observations $x$ and $y$ as input and returns the Spearman rank
correlation (the correlation coefficient calculated using ranks of
the observations).
Verify the \s-code using \texttt{cor.test(x, y, method="spearman")}.
\item A perfect shuffle of a deck of 52 playing cards occurs when the
deck is split perfectly into halves (26/26) and the cards are
exactly alternated. If the top card remains on the top after each
perfect shuffle, how many perfect outside shuffles are necessary to
restore the original order? If the top card becomes the second card
on each shuffle, the shuffle is called a perfect inside shuffle. If
the deck is ordered before the first shuffle, how many perfect
inside shuffles are necessary to restore the original order?
\item Generate a vector (denoted \texttt{ybar}) containing $k=20$ mean
values each composed of $n=50$ random observations from a normal
distribution with mean = 2 and standard deviation = 2.
Generate \texttt{ybar} using a ``\texttt{for}-loop.''
Generate \texttt{ybar} without using a ``\texttt{for}-loop.''
Increase $k$ and $n$ and note the difference in execution times.
\item Evaluate
\[
f(x,y)=\frac{\sin(x)}{\sqrt{1+\cos^2(y)}}
\]
over the range $-2\pi<x<2\pi$ and $-2\pi<y<2\pi$.
\item Show that for $n>9$, $P_m\approx\frac{1}{m!}e^{-1}$.
\end{enumerate}
\section{Descriptive techniques}
\emph{Problem set II}
\begin{enumerate}
\item Plot the function $f(x)=x\log(x)$ for $0<x\leq 1$.
\item Evaluate
\[
P=P(Z>z_0)
\]
where $Z$ has a standard normal distribution.
\item Demonstrate that for $n>100$ and $p<0.05$, the binomial and
Poisson distributions produce similar probabilities.
\item Demonstrate using \s-functions such as \texttt{qqnorm()} that
the Box-Muller transformation gives two independent standard normal
variates from two independent and uniformly distributed random
variables.
\item Plot a square with a circle inscribed within the boundaries.
Generate random pairs $(x,y)$ and determine whether these values are
in the circle or not. Use the result to estimate $\pi=3.1416$.
\item Demonstrate with an \s-simulation program that the expected mean
values resulting from sampling the same population with and without
replacement are equal. Which sampling has the smallest variance?
\item Create and test an \s-function that produces $n$ random values
from a Poisson distribution using the inverse transformation method
for a given value of $\lambda$. Compare the results to the values
generated with \texttt{rpois()}.
\item The test-statistic $X^2=(n-1)S_x^2/\bar{x}$ has an approximate
chi-square distribution with $n-1$ degrees of freedom when the $n$
values $x_i$ are sampled from a Poisson distribution. Use a
simulation program to verify that $X^2$ has an approximate
chi-square distribution for $n=100$ and $\lambda=1.0$. Use this fact
to test formally the fit of the Rutherford-Geiger data (Chapter 3)
to a Poisson distribution.
\end{enumerate}
\section{General Linear Models}
\emph{Problem set IV}
\begin{enumerate}
\item A measure of influence on the estimated value $\hat{y}_i$
associated with the $i$th independent variable $x_i$ can be defined
as $\hat{b}-\hat{b}_{(i)}$ where $\hat{b}$ is estimated from all $n$
observations and $\hat{b}_{(i)}$ is estimated from the same data set
but with the $i$th observation removed. Compute all such values for
the diastolic blood pressure data (Table~4.1) and determine the five
most influential points. Locate the five points on a plot of the
data.
\item Show\footnote{Solve theoretically and not with a computer
program.} that if the coefficients $a_i$ maximize the multivariate
distance $M^2$, then the coefficients $ba_i+c$ also maximize $M^2$
where $b$ and $c$ are constants.
\item The goodness-of-fit \s-code for the logistic model in the
chapter requires the number of observations to be evenly divisible
by 10. Write and test an \s-program for the same goodness-of-fit
test but make no assumptions about the total number of observations
used in the logistic regression analysis.
\item The table showing case/control status and coffee consumption
(Table~4.7) is a summary of $n=1010$ individuals where the
frequencies in the table are the counts of observations with the
identical values of the dependent and independent variables. For
example, there are nine records (one for each person) where the
outcome is a case who reports no coffee consumption ($x_1=0$) and is
a male participant ($x_2=1$)---first cell in the summary table.
Using Table~4.7 reconstruct the data so that there are 1010
individual records where each record contains 0 or 1 for
case/control status as well as the corresponding values of the
independent variables ($x_1$ and $x_2$).
Use these $n=1010$ records and \texttt{glm()} and conduct a logistic
regression analysis showing that the results are identical to the
ones in the chapter where the analysis is performed directly on the
tabular data using the cell frequencies as weights.
\item Consider the following data where birth weight and maternal age
are recorded for three groups based on smoking exposure status:
\begin{center}
\begin{tabular}{lrr|lrr|lrr}
& non-smoker & & & quitters & & & smokers & \\
\hline
& bwt & age & & bwt & age & & bwt & age \\
\hline
1 & 9.1 & 35 & 1 & 7.2 & 32 & 1 & 6.7 & 24 \\
2 & 8.9 & 29 & 2 & 7.7 & 30 & 2 & 6.5 & 24 \\
3 & 8.5 & 34 & 3 & 6.8 & 26 & 3 & 7.2 & 28 \\
4 & 7.4 & 32 & 4 & 7.0 & 33 & 4 & 6.5 & 26 \\
5 & 7.5 & 28 & 5 & 7.4 & 28 & 5 & 6.5 & 26 \\
6 & 7.3 & 28 & 6 & 6.2 & 29 & 6 & 7.1 & 26 \\
\end{tabular}
\end{center}
Using \s-tools conduct a separate simple linear regression analysis
for each smoking exposure group.
Use the same data and the model $y=a+b_1x+b_2g_1+b_3g_2+b_4g_1x+b_5g_2x$
to conduct a linear regression analysis using all 18 observations
simultaneously where $y$ = birth weight (dependent variable) and $x$
= age (independent variable). The design variable $g$ is defined as
$g_1=g_2=0$ for non-smokers, $g_1=0$, $g_2=1$ for quitters and
$g_1=1$, $g_2=0$ for smokers.
Demonstrate these two approaches are identical.
Formally, test the influence of the three-level smoking exposure
categorical variable on birth weight.
\item The following data\marginpar{\small data incl. in
\texttt{oakRNL.txt}} are deaths from lung cancer and person-years at
risk, classified by age and exposure to radiation for workers at the
Oak Ridge National Laboratory.
\begin{center}
\begin{small}
\begin{tabular}{ccrr|ccrr|ccrr}
\hskip-5em age & mSv & deaths & p-years & age & mSv & deaths & p-years & age
& mSv & deaths & p-years \\
\hskip-5em 1 & 1 & 0 & 29901 & 2 & 3 & 2 & 2423 & 3 & 5 & 0 & 476\\
\hskip-5em 2 & 1 & 1 & 6251 & 3 & 3 & 1 & 2281 & 4 & 5 & 0 & 387\\
\hskip-5em 3 & 1 & 4 & 5251 & 4 & 3 & 2 & 1918 & 5 & 5 & 0 & 225\\
\hskip-5em 4 & 1 & 3 & 4126 & 5 & 3 & 0 & 1322 & 6 & 5 & 1 & 164\\
\hskip-5em 5 & 1 & 3 & 2778 & 6 & 3 & 2 & 723 & 7 & 5 & 0 & 150\\
\hskip-5em 6 & 1 & 1 & 1607 & 7 & 3 & 3 & 538 & 1 & 6 & 0 & 779\\
\hskip-5em 7 & 1 & 3 & 1358 & 1 & 4 & 0 & 2341 & 2 & 6 & 0 & 296\\
\hskip-5em 1 & 2 & 1 & 71382 & 2 & 4 & 0 & 972 & 3 & 6 & 0 & 282\\
\hskip-5em 2 & 2 & 5 & 16705 & 3 & 4 & 1 & 958 & 4 & 6 & 1 & 251\\
\hskip-5em 3 & 2 & 4 & 13752 & 4 & 4 & 1 & 816 & 5 & 6 & 0 & 193\\
\hskip-5em 4 & 2 & 10 & 10439 & 5 & 4 & 0 & 578 & 6 & 6 & 0 & 125\\
\hskip-5em 5 & 2 & 11 & 7131 & 6 & 4 & 2 & 375 & 7 & 6 & 0 & 69\\
\hskip-5em 6 & 2 & 16 & 4133 & 7 & 4 & 3 & 303 & 1 & 7 & 0 & 520\\
\hskip-5em 7 & 2 & 11 & 3814 & 1 & 5 & 0 & 1363 & 2 & 7 & 0 & 188\\
\hskip-5em 1 & 3 & 0 & 6523 & 2 & 5 & 0 & 478 & 3 & 7 & 0 & 217\\
\hskip-5em 4 & 7 & 0 & 184 & 5 & 7 & 1 & 109 & 6 & 7 & 0 & 60\\
\hskip-5em 7 & 7 & 1 & 23 & 1 & 8 & 0 & 2104 & 2 & 8 & 0 & 1027\\
\hskip-5em 3 & 8 & 1 & 1029 & 4 & 8 & 3 & 827 & 5 & 8 & 1 & 555\\
\hskip-5em 6 & 8 & 2 & 297 & 7 & 8 & 2 & 153 &&&& \\
\end{tabular}
\end{small}
\end{center}
Recode age categories 1, 2, 3, 4, 5, 6, 7 into ages 45, 47.5, 52.5,
57.5, 62.5, 67.5, and 70 years. Recode exposure categories 1, 2, 3,
4, 5, 6, 7, 8 into exposures 0, 15, 30, 50, 70, 90, 110, and 120~mSv
(millisieverts).
Evaluate the exposure response in these data using a Poisson
regression approach.
When the open-ended (last) interval coded at 120~mSv is recoded to
160~mSv, assess the impact on the exposure/risk relationship using a
Poisson regression analysis.
Plot the impact on the dose-response relationship varying the
definitions of the coded value for the last exposure group (e.g.,
120, 130, 140, $\dots$, 220).
\item Generate a sample of $n=200$ random observations that are
described by the logistic model
\[
p_i=\frac{1}{1+e^{-(a+bx_i)}}
\]
where $a$ and $b$ are such that $or=3.0$.
Using \texttt{glm()} and the simulated data, estimate the odds ratio.
\end{enumerate}
\section{Estimation}
\emph{Problem set V}
\begin{enumerate}
\item The state fish and game service requires salmon catches to be
reported from any boat catching one or more fish. The boats that do
not catch fish, do not report. The data are, therefore, truncated
since the number of boats failing to catch fish are not recorded. An
example of such data is
\begin{center}
\begin{tabular}{l|rrrrrr}
number of fish & 1 & 2 & 3 & 4 & 5 & 6\\
\hline
boats & 34 & 25 & 12 & 5 & 1 & 0\\
\end{tabular}
\end{center}
Assume that the number of salmon caught per boat are described by a
Poisson distribution where
\[
f(x_i|\lambda)=\frac{e^{-\lambda}\lambda^{x_i}/x_i!}{1-e^{-\lambda}}\qquad
i=1,2,3,\dots.
\]
The symbol $x_i$ represents the number of fish caught per boat. Use
the \s-function \texttt{ms()} to find the maximum likelihood
estimate of $\lambda$.
Using the scoring technique to estimate parameters, find the maximum
likelihood estimate of $\lambda$ and an estimate of its variance.
Use the \texttt{uniroot()} \s-function to find the maximum
likelihood estimate of $\lambda$.
\item Consider the situation where the number of observations below
$c_0$ is known but the actual values of the observations are not
known (i.e., the distribution is left censored at $c_0$). Also the
number of observations above $c'_0$ is known but the actual values
of the observations are not known (i.e., the distribution is also
right censored at $c'_0$). Further assume that the sampled
distribution is normally distributed with mean $\mu$ and standard
deviation $\sigma$. Write and test an \s-code program to estimate
$\mu$ and $\sigma$ for this doubly censored normal distribution from
$n$ observations where $n_0$ are left censored, $n'_0$ are right
censored and $n-n_0-n'_0$ are measured values.
\item Consider the following model constructed to estimate the
proportion of dizygotic twins where
\begin{center}
\begin{small}
probability of a like-sex twin pair = P(like-sex twin) = $M+D/2$
and\newline
probability of an unlike-sex twin pairs = P(unlike-sex twin) = $D/2$
\end{small}
\end{center}
where $D=1-M$ is the proportion of dizygotic (fraternal twins). A
specific number of pairs of like-sex twins = 67 and unlike-sex twins
= 42 are observed.
Find\footnote{Solve theoretically and not with a computer
program.} the maximum likelihood estimate of $D$ and its variance
in closed form.
Use an \s-code program to estimate $D$ and its variance using
scoring techniques.
Use a bootstrap procedure to estimate $D$ and its variance.
\item A measure of skewness is
\[
\hat{M}=\frac{\sum(x_i-\bar{x})^3}{n}
\]
where $M=0$ identifies a symmetric distribution using a sample of
$n$ observations. For the data $\{2,5,8,2,5,9,1,4,30\}$ estimate $M$
and its standard error using a bootstrap procedure (i.e., find
$\hat{M}_{[\cdot]}$ and $\hat{se}(\hat{M}_{[\cdot]})$ estimates).
Find the same estimates using the jackknife procedure.
Estimate $M$ and its standard error using $n=100$ values sampled
from a standard normal distribution using both estimation
techniques.
\item Consider the following 15 observations:
\begin{small}\[
0.28,-1.21,0.60,0.14,0.51,0.19,-0.27,0.45,0.29,0.40,0.04,0.60,1.11,0.90.
\]\end{small}
Use a bootstrap strategy to assess the likelihood this sample arose
from a population with a mean value of 0 ($\mu_0=0$?).
\item A sample of data yields the following $2\times 2$ table:
\begin{center}
\begin{tabular}[!h]{c|cc|c}
& disease & no disease & total \\
\hline
exposed & $a=200$ & $b=120$ & 320 \\
unexposed & $c=80$ & $d=120$ & 200 \\
\hline
total & 280 & 240 & 520 \\
\end{tabular}
\end{center}
The odds ratio measure of association between disease and exposure
is estimated by $\hat{or}=ad/bc$. Construct and test an \s-program
to find the bootstrap estimate of the bias associated with this
estimate. The logarithm of the estimated odds ratio
($\log[\hat{or}]$) is another measure of association. Use an
\s-program to estimate the bias associated with this
measure of association.
\item For the data
\[\begin{array}{l}
x=5,10,15,20,25,30,35,40,45\quad \textrm{and}\\
y=0.08,0.12,0.22,0.21,0.27,0.56,0.70,0.71,0.84
\end{array}\]
use the model $y_i=[1+e^{-(a+bx_i)}]^{-1}$ and
\texttt{nls()}-function to estimate $a$ and $b$. Hint: use initial
value $a_0=-3$ and $b_0=0.1$.
Apply a linearizing transformation to $y$ and again estimate the
parameters $a$ and $b$ using ordinary least squares estimation.
\item Use a bootstrap procedure to estimate $\theta$, its standard
error, and the bias for
\[
\hat{\theta}=\frac{1}{n}\sum(x_i-\bar{x})^2
\]
where $n=15$ and
$x=\{12,13,23,31,41,22,44,37,14,18,24,36,51,11,32\}$.
Use a bootstrap procedure to estimate $\theta$, its standard error,
and the bias from a sample of $n=100$ random values selected from a
standard normal distribution.
\item For the two sets of $n=15$ observations
\[
x=\{1,3,2,6,8,3,8,3,9,10,15,12,18,5,2\}
\]
and
\[
y=\{10,14,15,22,28,21,14,15,12,18,33,37,33,11,12\}
\]
write and test an \s-program to assess the conjecture that $x$ and
$y$ are samples of unrelated variables (\emph{correlation} = 0)
using a randomization strategy.
\item A simple Mendelian genetic system is represented by the
following model:
\begin{center}
\begin{tabular}{ll}
AA-homozygote frequency: & $p^2$ \\
Aa-heterozygote frequency: & $2pq$ \\
aa-homozygote frequency: & $q^2$\\
\end{tabular}
\end{center}
where $p$ represents the frequency of the A-gene and $q=1-p$
represents the frequency of the a-gene.
Find\footnote{Solve theoretically and not with a computer
program.} the maximum likelihood estimate of $p$ and its variance
when $n_1$, $n_2$, and $n_3$ represent the respective observed
counts of AA, Aa, and aa genotypes where
\[
\log(L)=n_1\log(p^2)+n_2\log(2pq)+n_3\log(q^2).
\]
If $n_1=250$, $n_2=441$, and $n_3=314$, find the maximum likelihood
estimate of $p$ using \s-tools to verify the ``closed-form''
estimate and variance.
If the laboratory determination of the homozygotes is subject to
misclassification, the log-likelihood function is then
\[
\log(L)=n_1\log(p^2+e)+n_2\log(2pq)+n_3\log(q^2-e)
\]
where $e$ represents the proportion misclassified homozygotic types.
If $n_1=250$, $n_2=441$, and $n_3=314$, find the maximum likelihood
estimate of $p$ and $e$ using \s-tools. Also estimate the
variance/covariance array for the estimates of $p$ and $e$.
\item Generate two sets of $n=100$ random variables where $x$ and $y$
have independent standard normal distributions.
Use bootstrap tools to estimate the correlation between $x$ and
$2x+y$ (i.e., \emph{correlation}($x$,$2x+y$)). Also estimate the
variance and bias associated with this estimate. Plot the histogram
and the estimated density function of the estimated correlation
coefficient.
\end{enumerate}
\section{Analysis of Tabular Data}
\emph{Problem set VI}
\begin{enumerate}
\item For the data in the following table, work out the estimated
values of $\hat{a}$, $\hat{b}_1$, $\hat{b}_2$, and $\hat{b}_3$ for
the saturated model algebraically:
\begin{center}
\begin{tabular}[!h]{rr|rrr}
$a_i$ & $b_i$ & $f_{ij}$ & $F_i$ & data \\
\hline
0 & 0 & $f_{00}$ & $F_1$ & 23 \\
1 & 0 & $f_{10}$ & $F_2$ & 12 \\
0 & 1 & $f_{01}$ & $F_3$ & 45 \\
1 & 1 & $f_{11}$ & $F_4$ & 122 \\
\end{tabular}
\end{center}
Verify the results using an \s-program.
\item If $n$ and $p$ are parameters of a binomial distribution, then
\[
z_0=\frac{x-np}{\sqrt{np(1-p)}}\quad \textrm{and}\quad
z_1=\frac{x-np\pm 0.5}{\sqrt{np(1-p)}}
\]
provide approximate binomial probabilities (i.e., $pnorm\approx
pbinom$). Compare the maximum difference between the exact binomial
and normal approximated probabilities for $n=10,\, p=0.5$; $n=20,\,
p=0.2$; $n=50,\, p=0.1$; $n=100,\, p=0.05$.
\item Consider the $2\times 4$ table:
\begin{center}
\begin{tabular}[!h]{c||cccc|c}
& $X=0$ & $X=1$ & $X=2$ & $X=3$ & total \\
\hline
$Y=0$ & 1 & 7 & 15 & 40 & 63 \\
$Y=1$ & 4 & 19 & 34 & 42 & 99 \\
\hline
total & 5 & 26 & 49 & 82 & 162 \\
\end{tabular}
\end{center}
Compute $S_{xx}$, $S_{yy}$ and $S_{xy}$.
Calculate the chi-square statistics reflecting the total, linear,
and non-linear influences. Demonstrate that the correlation between
$X$ and $Y$ based on $n=162$ pairs $(X,Y)$, called the biserial
correlation coefficient, is directly related to the chi-square
statistic reflecting the linear association or
\[
r_{XY}=\sqrt{\frac{\textrm{linear chi-square statistic}}{n-1}}.
\]
\item Show\footnote{Solve theoretically and not with a computer
program.} algebraically that $or_1/or_2=e^{\hat{b}_7}$ when a
saturated loglinear model is applied to a $2\times 2\times 2$ table
where $or_1$ is the odds ratio calculated from one 2 by 2 subtable
and $or_2$ is the odds ratio calculated from the other subtable.
\item An estimate of the variance of Yule's measure of association $Y$
is
\[
variance(Y)=\frac{1}{4}(1-Y^2)^2\left[\frac{1}{F_1}+\frac{1}{F_2}+
\frac{1}{F_3}+\frac{1}{F_4}\right].
\]
Create an \s-function to calculate $Y$, the variance of $Y$ and to
construct an approximate 95\% confidence interval for the expected
value of $Y$. Show that Yule's $Y$ is equivalent to the odds ratio
measure $\hat{or}$ in a 2 by 2 table.
\item Demonstrate that the expected values calculated under the
hypothesis of statistical independence (usual chi-square expected
values---Chapter 1 example) are essentially the same as the
estimates of the cell frequencies based on a loglinear additive
model using the following data:
\begin{center}
\begin{tabular}[!h]{l|rrrr}
& $b_1$ & $b_2$ & $b_3$ & $b_4$ \\
\hline
$a_1$ & 12 & 8 & 22 & 4 \\
$a_2$ & 5 & 3 & 11 & 2 \\
$a_3$ & 26 & 17 & 44 & 10 \\
$a_4$ & 53 & 38 & 82 & 18 \\
$a_5$ & 108 & 75 & 167 & 44 \\
\end{tabular}
\end{center}
\item Consider the following $2\times 2\times 2$ table:
\begin{center}
\begin{tabular}[!h]{ccc|c|l}
$x_1$ & $x_2$ & $x_3$ & $F_i$ & count \\
\hline
1 & 1 & 0 & $F_1$ & $\hat{F}_1=11+e$ \\
1 & 0 & 0 & $F_2$ & $\hat{F}_2=8-e$ \\
0 & 1 & 0 & $F_3$ & $\hat{F}_3=12-e$ \\
0 & 0 & 0 & $F_4$ & $\hat{F}_4=37+e$ \\
1 & 1 & 1 & $F_5$ & $\hat{F}_5=22-e$ \\
1 & 0 & 1 & $F_6$ & $\hat{F}_6=5+e$ \\
0 & 1 & 1 & $F_7$ & $\hat{F}_7=3+e$ \\
0 & 0 & 1 & $F_8$ & $\hat{F}_8=7-e$ \\
\end{tabular}
\end{center}
Find the value $e$ such that no interaction exists (i.e.,
$\hat{or}_1=\hat{or}_2$). Display the ``data'' ($\hat{F}_i$) and
show that the odds ratios measuring the association between any two
variables at both levels of the third variable are the same. Use a
loglinear model applied to the created ``data'' ($\hat{F}_i=F_i\pm
e$ value) to show that the x1*x2*x3-term is zero (exact homogeneity).
\end{enumerate}
\section{Analysis of Variance and Some Other \s-Functions}
\emph{Problem set VII}
\begin{enumerate}
\item Use the cholesterol and behavior type data (Table~1.5) to show that a
two-sample $t$-test, a simple linear regression analysis, and an analysis of
variance give identical results. That is, all three approaches produce the
same significance probability for comparing levels of cholesterol between
behavior type A and B.
\item Use the \texttt{glm()} \s-function to reproduce the results found in the
chapter using the \texttt{aov()} \s-command for the two-way table of lead level
determination data (Table~7.2).
\item Construct and test an \s-program to execute a Kruskal-Wallis rank test
for independent samples. Compare the results with the \s-function
\texttt{kruskal.test()} for a set of simulated data with no identical (tied)
values.\\
Construct and test an \s-program to execute a Wilcoxon signed rank test for
a set of matched pairs data. Compare the results with the \s-function
\texttt{wilcox.test()} for a set of simulated data with no identical (tied)
values.
\item Conduct a principal component analysis where $x_1=\{1,2,3,4,5,6,7\}$ and
$x_2=\{7,6,5,4,3,2,1\}$. Calculate the variance of $x_1$, the variance of
$x_2$, the variance of the first principal component, and the variance of
the second principal component. Why are these variances the same?
\item Using the turtle data (Table~4.4) calculate the first two principal
components. To identify differences by gender (clustering), plot the 48
values of each principal component (one against the other).
\item Use the following data to show that a canonical correlation and the
multiple correlation coefficient are the same when one group consists of a
single variable ($y$) and the other group ($x$) has $k=2$ variables (i.e.,
compare results from \texttt{cancor()} with \texttt{lm()} \s-functions).
\begin{center}
\begin{tabular}[!h]{l|rrr}
& $y$ & $x_1$ & $x_2$ \\
\hline
1. & 4.8 & 1 & 0.2 \\
2. & 14.1 & 2 & 0.6 \\
3. & 10.7 & 3 & 0.2 \\
4. & 18.3 & 4 & 0.8 \\
5. & 12.7 & 5 & 0.2 \\
6. & 17.2 & 6 & 1.9 \\
7. & 16.0 & 7 & 1.3 \\
8. & 22.0 & 8 & 1.6 \\
9. & 22.0 & 9 & 1.7 \\
10. & 23.6 & 10 & 1.1 \\
\end{tabular}
\end{center}
\item Using the weight gain matched pair data (Table~7.3), assess the
association between maternal weight gain and low birth weight ignoring the
paired structure (as if the two infants represent samples from separate and
unrelated populations). In other words, does the paired pattern of the data
collection improve the efficiency of the analysis or not?
\item Write and test an \s-program to execute a randomization test for matched
pair data. Conduct a matched pair randomization test using the paired data
in Table~7.3. Compare the results to the tests used in the chapter to
analyze the association between birth weight and maternal weight gain
matched for pregnancy weight.
\item A total of $N=11$ matched sets of data are collected (one case and two
controls). For each infant with a birth defect (case) born in a rural area
in France, two infants (control) were selected who were born at essentially
the same time, in the same village and were the same sex. The matched data
consist of the distances to electromagnetic radiation exposure (risk
factor---measured in meters).
\begin{center}\begin{small}
\begin{tabular}[!h]{crrrrrrrrrrr}
\hskip-5em malformation & 1150 & 100 & 2000 & 350 & 400 & 2700 & 1200 & 1800 & 10 &
250 & 350 \\
\hskip-5em control 1 & 300 & 100 & 2150 & 1350 & 800 & 1250 & 450 & 400 & 900 &
1950 & 1050 \\
\hskip-5em control 2 & 750 & 650 & 4050 & 450 & 700 & 2850 & 50 & 2300 & 150 &
300 & 1000 \\
\end{tabular}
\end{small}\end{center}
Using these 1:2 matched sets assess the association between electromagnetic
radiation and birth defects (does the distances among cases differ from
distances among controls?).\\
Ignoring the matched data collection design, again evaluate the association
between electromagnetic radiation and birth defects. Does the matched
pattern of the data collection improve the efficiency of the analysis?
\item Simulate a data set of $n=100$ pairs of matched observations.\\
Demonstrate that the results using the binomial test (without a correction
factor) and the Friedman test are identical.\\
Simulate a data set of $n_1=100$ and $n_2=100$ observations from two
independent populations. Demonstrate that the results using the Wilcoxon
two-sample test (\texttt{pairs=F}) and the Kruskal-Wallis tests are
identical.
\end{enumerate}
\section{Rates, Life Tables, and Survival}
\emph{Problem set VIII}
\begin{enumerate}
\item Show\footnote{Solve theoretically and not with a computer program.} the
equivalence of the three expressions:
\[\begin{array}{rl}
\textrm{person-years} &= \delta_xP_{x+\delta_x}+\frac{1}{2}\delta_xD_x,\\
\textrm{person-years} &= \delta_xP_{x}-\frac{1}{2}\delta_xD_x,\\
\textrm{person-years} &= \frac{1}{2}\delta_x(P_{x+\delta_x}+P_x).\\
\end{array}\]
Show\footnote{Solve theoretically and not with a computer program.} that for
an exponential survival model
\[
S(T > t_2\mid T > t_1)=e^{-\lambda(t_2-t_1)}.
\]
Show\footnote{Solve theoretically and not with a computer program.} that
when $\lambda_1(t)/\lambda_2(t)=c$, then $S_1(t)=[S_2(t)]^c$.
\item If the survival times from one group are $\{7.5, 12, 18, 33^+, 55.5,
61.5\}$ and for another group are $\{34.5, 60, 64.5, 76.5^+, 93\}$, show
that the log-rank test (i.e., \texttt{surv.diff()} function) gives
essentially the same results as the proportional hazards model (i.e.,
\texttt{coxreg()} function). The ``{\footnotesize +}'' indicates a censored survival time.
\item If $d$ observations are complete (not censored) in a sample of $n$
distinct survival times from exponentially distributed data, the likelihood
function is
\[
L=\prod_i\lambda e^{-\lambda t_i}\times \prod_j\lambda e^{-\lambda
t'_j}\quad i=1,2,3,\dots,d\; \textrm{and}\; j=1,2,3,\dots,n-d
\]
where $t_i$ represents complete observations and $t'_j$ represents censored
observations. Find the maximum likelihood estimate\footnote{Solve
theoretically and not with a computer program.} of $\lambda$. Verify this
estimator using the survival data from problem 2 and the \s-function
\texttt{ms()}.
\item If the survival function is $S(t)=1-t/b$ where $0\leq t\leq
b=\textrm{constant}$, find the hazard function $\lambda(t)$ and the
cumulative hazard function $H(t)$. Plot these three curves on a single
page. Derive an expression for the average rate ($R_t=$
deaths/person-years). Show\footnote{Solve theoretically and not with a
computer program.} that $R_t\approx\lambda(t)$ for small time intervals.
\item Consider the $n=11$ complete survival times $t=\{1, 4, 6, 8, 2, 12, 24,
23, 25, 27, 31\}$.\\
Use an \s-program to demonstrate that the Kaplan-Meier estimated mean
survival time is the same as the usual mean value $\bar t=\sum t_i/n$ and
the variance is $(n-1)\textrm{variance}(t)/n^2$.\\
Show \footnote{Solve theoretically and not with a computer program.}
algebraically that for complete survival data
\[
\bar t =\frac{1}{n}\sum t_i=\sum P_{i-1}(t_i-t_{i-1})
\]
where $i=1,2,3,\dots,n$.
\end{enumerate}
\end{document}