10.3 Solutions

10.3.1 Exercise 13.3 - Two-way contingency tables

cancer <- read.table("https://lbelzile.bitbucket.io/math341/cancer.dat", header = TRUE)
print(cancer)

##      age malignant yes no
## 1    <50       yes  26  9
## 2    <50        no  68  7
## 3    <50       yes  25  4
## 4    <50        no   9  3
## 5  50-69       yes  20  9
## 6  50-69        no  46  9
## 7  50-69       yes  18 11
## 8  50-69        no   5  2
## 9    70+       yes   1  2
## 10   70+        no   6  3
## 11   70+       yes   5  1
## 12   70+        no   1  0
## 13   <50       yes  11  6
## 14   <50        no  24  7
## 15   <50       yes   4  6
## 16   <50        no   0  0
## 17 50-69       yes  18  8
## 18 50-69        no  58 20
## 19 50-69       yes  10  3
## 20 50-69        no   3  2
## 21   70+       yes  15  9
## 22   70+        no  26 18
## 23   70+       yes   1  3
## 24   70+        no   1  0
## 25   <50       yes  16 16
## 26   <50        no  20  7
## 27   <50       yes   8  3
## 28   <50        no   1  0
## 29 50-69       yes  27 14
## 30 50-69        no  39 12
## 31 50-69       yes  10  3
## 32 50-69        no   4  0
## 33   70+       yes  12  3
## 34   70+        no  11  7
## 35   70+       yes   4  3
## 36   70+        no   1  0

# Some categories have small counts, so asymptotic result may be a bit off

cancer.m0 <- glm(cbind(yes, no) ~ 1, family = "binomial", data = cancer)
cancer.m1 <- glm(cbind(yes, no) ~ age, family = "binomial", data = cancer)
cancer.m2 <- glm(cbind(yes, no) ~ malignant, family = "binomial", data = cancer)
cancer.m3 <- glm(cbind(yes, no) ~ age + malignant, family = "binomial", data = cancer)
cancer.m4 <- glm(cbind(yes, no) ~ age * malignant, family = "binomial", data = cancer)
library(xtable)
devtab <- data.frame("model" = c("M0","M1","M2","M3"), 
                     "deviance" = round(c(deviance(cancer.m0), deviance(cancer.m1), 
                                          deviance(cancer.m2), deviance(cancer.m3)), 2),
      "p" = c(length(coef(cancer.m0)),length(coef(cancer.m1)),
              length(coef(cancer.m2)),length(coef(cancer.m3))))

devtab

##   model deviance p
## 1    M0    57.59 1
## 2    M1    50.44 3
## 3    M2    48.59 2
## 4    M3    41.06 4

Now that we have calculated the deviance of every model, we can perform an analysis of deviance and check whether the final model obtained by backward elimination is adequate by comparing its \(P\)-value under the null. Some counts in the age category 70+ are very low, hence the asymptotic result can be a bit off. We assess this through a small simulation study in which we resample observations from the fitted model.

# Saturated model vs additive model
 1- pchisq(deviance(cancer.m3), df = nrow(cancer) - length(cancer.m3$coef))

## [1] 0.1308746

# p-value of 0.78, fail to reject null that additive model is adequate
# Try further simplification
drop1(cancer.m3, test = "Chisq")

## Single term deletions
## 
## Model:
## cbind(yes, no) ~ age + malignant
##           Df Deviance    AIC    LRT Pr(>Chi)
## <none>         41.062 143.61                
## age        2   48.592 147.14 7.5296 0.023172
## malignant  1   50.435 150.99 9.3732 0.002202

# Fail to reject null that model with only "malignant" is adequate simplification
 1- pchisq(2*(c(logLik(cancer.m3) - logLik(cancer.m2))), df = (length(cancer.m3$coef) - length(cancer.m2$coef)))

## [1] 0.02317223

 1- pchisq(2*(c(logLik(cancer.m3) - logLik(cancer.m1))), df = (length(cancer.m3$coef) - length(cancer.m1$coef)))

## [1] 0.002201776

drop1(cancer.m2, test = "Chisq")

## Single term deletions
## 
## Model:
## cbind(yes, no) ~ malignant
##           Df Deviance    AIC    LRT Pr(>Chi)
## <none>         48.592 147.14                
## malignant  1   57.588 154.14 8.9965 0.002705

# Reject null that model with only intercept is adequate

# If Model is adequate, Deviance approx JK-p 
deviance(cancer.m2)

## [1] 48.59153

# Is this result vary large? Investigate via simulation study
# Canonical link functions for binomial
logit <- function(x){log(x) - log(1-x)}
# Inverse link function (logistic in course notes)
expit <- function(x){ 1/(1+ exp(-x))}
# Get predicted probability
probc <- expit(predict(cancer.m2))
# Need to condition on total count - fixed for binomial model
nr <- rowSums(cancer[,c("yes","no")])
# Simulate new datasets from the model, compute their deviance
simudev <- rep(0, 1e3L)
for(i in 1:length(simudev)){
  newyes <- sapply(1:length(nr), function(j){rbinom(n = 1, size = nr[j], prob = probc[j])})
  newcancer <- data.frame(malignant = cancer$malignant, yes = newyes, no = nr - newyes)
  simudev[i] <- deviance(glm( cbind(yes, no) ~ malignant, family = "binomial", data = newcancer))
}
# Distribution of deviance from simulated model, conditional on row total
hist(simudev, main = "Deviance of simulated models", xlab = "Deviance", probability = TRUE, ylim = c(0, 0.2), breaks = 20)
# Line corresponding to deviance of 
abline(v = deviance(cancer.m2), lty = 2)
# Null distribution
curve(dchisq(x, df = cancer.m2$df.residual), col = 2, add = TRUE)

# P-value from simulation
sum(I(deviance(cancer.m2) < simudev)) / length(simudev)

## [1] 0.06

# P-value with asymptotic distribution
1-pchisq(deviance(cancer.m2), df = (nrow(cancer) - length(coef(cancer.m2))))

## [1] 0.05010706

# About 4% difference between the two

10.3.2 Exercise 13.5 - Equivalence of binomial and Poisson models

We can fit the model using the Poisson generalized linear model with an offset term for log(pop), since the latter is fixed. We cannot make direct comparisons because the population size in each category are different. If \(X \sim \mathcal{B}(m_i, \pi_i)\), the Poisson approximation is \(X \sim \mathcal{P}(\lambda_i)\) with \(\lambda_i \approx m_i\pi_i\). Taking logarithms, we get \(\log(\lambda_i) = \log(m_i) + \log(\pi_i)\). The term \(\log(m_i)\) is an offset with known coefficient of 1.

smoking <- read.table("https://lbelzile.bitbucket.io/math341/smoking.dat", header = TRUE)
smoking.p.m0 <- glm(dead ~ offset(log(pop)), family = poisson, data = smoking)
smoking.p.m1 <- glm(dead ~ offset(log(pop)) + smoke, family = poisson, data = smoking)
smoking.p.m2 <- glm(dead ~ offset(log(pop)) + age, family = poisson, data = smoking)
smoking.p.m3 <- glm(dead ~ offset(log(pop)) + smoke + age, family = poisson, data = smoking)
#Define quantities
n <- nrow(smoking)
p0 <- length(coef(smoking.p.m0)); D0p <- deviance(smoking.p.m0)
p1 <- length(coef(smoking.p.m1)); D1p <- deviance(smoking.p.m1)
p2 <- length(coef(smoking.p.m2)); D2p <- deviance(smoking.p.m2)
p3 <- length(coef(smoking.p.m3)); D3p <- deviance(smoking.p.m3)

#Analysis of deviance
1 - pchisq(D3p, df = n - p3) # Interaction not stat. significative

## [1] 0.609872

1 - pchisq(D2p - D3p, df = p3 - p2) # smoke significative

## [1] 0

1 - pchisq(D1p - D3p, df = p3 - p1) # age significative

## [1] 0

#If Model is correct, D3 approx chisq(n - p3)
summary(smoking.p.m3)

## 
## Call:
## glm(formula = dead ~ offset(log(pop)) + smoke + age, family = poisson, 
##     data = smoking)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.06055  -0.54773   0.06431   0.29963   1.48348  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -3.63222    0.06783 -53.552  < 2e-16
## smokecigarretteOnly  0.36915    0.03791   9.737  < 2e-16
## smokecigarrettePlus  0.17015    0.03643   4.671 3.00e-06
## smokeno             -0.04781    0.04699  -1.017    0.309
## age45-49             0.55388    0.07999   6.924 4.38e-12
## age50-54             0.98039    0.07682  12.762  < 2e-16
## age55-59             1.37946    0.06526  21.138  < 2e-16
## age60-64             1.65423    0.06257  26.439  < 2e-16
## age65-69             1.99817    0.06279  31.824  < 2e-16
## age70-74             2.27141    0.06435  35.296  < 2e-16
## age75-79             2.55858    0.06778  37.746  < 2e-16
## age80+               2.84692    0.07242  39.310  < 2e-16
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   21.487  on 24  degrees of freedom
## AIC: 285.51
## 
## Number of Fisher Scoring iterations: 4

#Same with binomial model
smoking.b.m0 <- glm(cbind(dead, pop - dead) ~ 1, family = binomial, data = smoking)
smoking.b.m1 <- glm(cbind(dead, pop - dead) ~ smoke, family = binomial, data = smoking)
smoking.b.m2 <- glm(cbind(dead, pop - dead) ~ age, family = binomial, data = smoking)
smoking.b.m3 <- glm(cbind(dead, pop - dead) ~ smoke + age, family = binomial, data = smoking)
#Define quantities
n <- nrow(smoking)
D0b <- deviance(smoking.b.m0)
D1b <- deviance(smoking.b.m1)
D2b <- deviance(smoking.b.m2)
D3b <- deviance(smoking.b.m3)

1 - pchisq(D3b, df = n - p3) # Interaction not stat. significative

## [1] 0.5530777

1 - pchisq(D2b - D3b, df = p3 - p2) # smoking group significative

## [1] 0

1 - pchisq(D1b - D3b, df = p3 - p1) # age group significative

## [1] 0

#If Model is correct, D3 approx n - p3
summary(smoking.b.m3)

## 
## Call:
## glm(formula = cbind(dead, pop - dead) ~ smoke + age, family = binomial, 
##     data = smoking)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79861  -0.61857   0.03789   0.45630   1.84921  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -3.69508    0.07224 -51.152  < 2e-16
## smokecigarretteOnly  0.50308    0.04494  11.194  < 2e-16
## smokecigarrettePlus  0.24727    0.04307   5.740 9.44e-09
## smokeno             -0.04696    0.05479  -0.857    0.391
## age45-49             0.58001    0.08191   7.081 1.43e-12
## age50-54             1.03983    0.07921  13.128  < 2e-16
## age55-59             1.49243    0.06718  22.216  < 2e-16
## age60-64             1.82007    0.06438  28.270  < 2e-16
## age65-69             2.25585    0.06511  34.647  < 2e-16
## age70-74             2.63304    0.06785  38.808  < 2e-16
## age75-79             3.07094    0.07423  41.368  < 2e-16
## age80+               3.56348    0.08433  42.259  < 2e-16
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4917.031  on 35  degrees of freedom
## Residual deviance:   22.439  on 24  degrees of freedom
## AIC: 277.38
## 
## Number of Fisher Scoring iterations: 4

# Output deviance table
devtab <- data.frame("model" = c("M0","M1","M2","M3"), 
                     "deviance binom." = round(c(D0p, D1p, D2p, D3p), 2),
                     "deviance Poisson" = round(c(D0b, D1b, D2b, D3b), 2),
                     "p" = c(p0, p1, p2, p3))
print(devtab)

##   model deviance.binom. deviance.Poisson  p
## 1    M0         4055.98          4917.03  1
## 2    M1         3910.70          4740.34  4
## 3    M2          191.72           247.94  9
## 4    M3           21.49            22.44 12

#To export to LaTeX
#xtab <- xtable(devtab, caption = "Analysis of deviance for the \\texttt{smoking} data set")
#print(xtab, booktabs = TRUE, sanitize.text.function = identity, include.rownames = FALSE, )

# Compare fitted probabilities
par(mar = c(6,6,3,3))
plot(fitted(smoking.b.m3), fitted(smoking.b.m3) - fitted(smoking.p.m3)/smoking$pop,
     xlab = "Fitted probability of death\n from logistic model",
     ylab = "Difference in fitted probability of death \nbetween Poisson and logistic models",
     main = "Smoking cancer dataset", bty = "l")

#Differ most for last categories in which the Poisson approximation is dubious