359 lines
11 KiB
R
359 lines
11 KiB
R
#-*- R -*-
|
|
|
|
## Script from Fourth Edition of `Modern Applied Statistics with S'
|
|
|
|
# Chapter 6 Linear Statistical Models
|
|
|
|
library(MASS)
|
|
library(lattice)
|
|
options(width=65, digits=5, height=9999)
|
|
pdf(file="ch06.pdf", width=8, height=6, pointsize=9)
|
|
options(contrasts = c("contr.helmert", "contr.poly"))
|
|
|
|
|
|
# 6.1 A linear regression example
|
|
|
|
xyplot(Gas ~ Temp | Insul, whiteside, panel =
|
|
function(x, y, ...) {
|
|
panel.xyplot(x, y, ...)
|
|
panel.lmline(x, y, ...)
|
|
}, xlab = "Average external temperature (deg. C)",
|
|
ylab = "Gas consumption (1000 cubic feet)", aspect = "xy",
|
|
strip = function(...) strip.default(..., style = 1))
|
|
|
|
|
|
gasB <- lm(Gas ~ Temp, data = whiteside, subset = Insul=="Before")
|
|
gasA <- update(gasB, subset = Insul=="After")
|
|
|
|
summary(gasB)
|
|
summary(gasA)
|
|
|
|
varB <- deviance(gasB)/gasB$df.resid # direct calculation
|
|
varB <- summary(gasB)$sigma^2 # alternative
|
|
|
|
gasBA <- lm(Gas ~ Insul/Temp - 1, data = whiteside)
|
|
summary(gasBA)
|
|
|
|
gasQ <- lm(Gas ~ Insul/(Temp + I(Temp^2)) - 1, data = whiteside)
|
|
summary(gasQ)$coef
|
|
|
|
# R: options(contrasts = c("contr.helmert", "contr.poly"))
|
|
gasPR <- lm(Gas ~ Insul + Temp, data = whiteside)
|
|
anova(gasPR, gasBA)
|
|
|
|
oldcon <- options(contrasts = c("contr.treatment", "contr.poly"))
|
|
gasBA1 <- lm(Gas ~ Insul*Temp, data = whiteside)
|
|
summary(gasBA1)$coef
|
|
options(oldcon)
|
|
|
|
|
|
# 6.2 Model formulae and model matrices
|
|
|
|
dat <- data.frame(a = factor(rep(1:3, 3)),
|
|
y = rnorm(9, rep(2:4, 3), 0.1))
|
|
obj <- lm(y ~ a, dat)
|
|
(alf.star <- coef(obj))
|
|
Ca <- contrasts(dat$a) # contrast matrix for `a'
|
|
drop(Ca %*% alf.star[-1])
|
|
dummy.coef(obj)
|
|
|
|
|
|
N <- factor(Nlevs <- c(0,1,2,4))
|
|
contrasts(N)
|
|
contrasts(ordered(N))
|
|
|
|
N2 <- N
|
|
contrasts(N2, 2) <- poly(Nlevs, 2)
|
|
N2 <- C(N, poly(Nlevs, 2), 2) # alternative
|
|
contrasts(N2)
|
|
|
|
fractions(ginv(contr.helmert(n = 4)))
|
|
|
|
Cp <- diag(-1, 4, 5); Cp[row(Cp) == col(Cp) - 1] <- 1
|
|
Cp
|
|
fractions(ginv(Cp))
|
|
|
|
|
|
# 6.3 Regression diagnostics
|
|
|
|
(hills.lm <- lm(time ~ dist + climb, data = hills))
|
|
frame()
|
|
par(fig = c(0, 0.6, 0, 0.55))
|
|
plot(fitted(hills.lm), studres(hills.lm))
|
|
abline(h = 0, lty = 2)
|
|
# identify(fitted(hills.lm), studres(hills.lm), row.names(hills))
|
|
par(fig = c(0.6, 1, 0, 0.55), pty = "s")
|
|
qqnorm(studres(hills.lm))
|
|
qqline(studres(hills.lm))
|
|
par(pty = "m")
|
|
hills.hat <- lm.influence(hills.lm)$hat
|
|
cbind(hills, lev = hills.hat)[hills.hat > 3/35, ]
|
|
cbind(hills, pred = predict(hills.lm))["Knock Hill", ]
|
|
(hills1.lm <- update(hills.lm, subset = -18))
|
|
update(hills.lm, subset = -c(7, 18))
|
|
summary(hills1.lm)
|
|
summary(update(hills1.lm, weights = 1/dist^2))
|
|
lm(time ~ -1 + dist + climb, hills[-18, ], weight = 1/dist^2)
|
|
|
|
# hills <- hills # make a local copy (needed in S-PLUS)
|
|
hills$ispeed <- hills$time/hills$dist
|
|
hills$grad <- hills$climb/hills$dist
|
|
(hills2.lm <- lm(ispeed ~ grad, data = hills[-18, ]))
|
|
frame()
|
|
par(fig = c(0, 0.6, 0, 0.55))
|
|
plot(hills$grad[-18], studres(hills2.lm), xlab = "grad")
|
|
abline(h = 0, lty = 2)
|
|
# identify(hills$grad[-18], studres(hills2.lm), row.names(hills)[-18])
|
|
par(fig = c(0.6, 1, 0, 0.55), pty = "s")
|
|
qqnorm(studres(hills2.lm))
|
|
qqline(studres(hills2.lm))
|
|
par(pty = "m")
|
|
hills2.hat <- lm.influence(hills2.lm)$hat
|
|
cbind(hills[-18,], lev = hills2.hat)[hills2.hat > 1.8*2/34, ]
|
|
|
|
|
|
# 6.4 Safe prediction
|
|
|
|
quad1 <- lm(Weight ~ Days + I(Days^2), data = wtloss)
|
|
quad2 <- lm(Weight ~ poly(Days, 2), data = wtloss)
|
|
|
|
new.x <- data.frame(Days = seq(250, 300, 10),
|
|
row.names = seq(250, 300, 10))
|
|
|
|
predict(quad1, newdata = new.x)
|
|
predict(quad2, newdata = new.x)
|
|
|
|
# predict.gam(quad2, newdata = new.x) # S-PLUS only
|
|
|
|
|
|
# 6.5 Robust and resistant regression
|
|
|
|
# library(lqs)
|
|
phones.lm <- lm(calls ~ year, data = phones)
|
|
attach(phones); plot(year, calls); detach()
|
|
abline(phones.lm$coef)
|
|
abline(rlm(calls ~ year, phones, maxit=50), lty = 2, col = 2)
|
|
abline(lqs(calls ~ year, phones), lty =3, col = 3)
|
|
# legend(locator(1), lty = 1:3, col = 1:3,
|
|
# legend = c("least squares", "M-estimate", "LTS"))
|
|
|
|
## cor = FALSE is the default in R
|
|
summary(lm(calls ~ year, data = phones))
|
|
summary(rlm(calls ~ year, maxit = 50, data = phones))
|
|
summary(rlm(calls ~ year, scale.est = "proposal 2", data = phones))
|
|
summary(rlm(calls ~ year, data = phones, psi = psi.bisquare))
|
|
|
|
lqs(calls ~ year, data = phones)
|
|
lqs(calls ~ year, data = phones, method = "lms")
|
|
lqs(calls ~ year, data = phones, method = "S")
|
|
|
|
summary(rlm(calls ~ year, data = phones, method = "MM"))
|
|
|
|
# library(robust) # S-PLUS only
|
|
# phones.lmr <- lmRob(calls ~ year, data = phones)
|
|
# summary(phones.lmr)
|
|
# plot(phones.lmr)
|
|
|
|
hills.lm
|
|
hills1.lm # omitting Knock Hill
|
|
rlm(time ~ dist + climb, data = hills)
|
|
summary(rlm(time ~ dist + climb, data = hills,
|
|
weights = 1/dist^2, method = "MM"))
|
|
lqs(time ~ dist + climb, data = hills, nsamp = "exact")
|
|
summary(hills2.lm) # omitting Knock Hill
|
|
summary(rlm(ispeed ~ grad, data = hills))
|
|
summary(rlm(ispeed ~ grad, data = hills, method="MM"))
|
|
# summary(lmRob(ispeed ~ grad, data = hills))
|
|
|
|
lqs(ispeed ~ grad, data = hills)
|
|
|
|
|
|
# 6.6 Bootstrapping linear models
|
|
|
|
library(boot)
|
|
fit <- lm(calls ~ year, data = phones)
|
|
ph <- data.frame(phones, res = resid(fit), fitted = fitted(fit))
|
|
ph.fun <- function(data, i) {
|
|
d <- data
|
|
d$calls <- d$fitted + d$res[i]
|
|
coef(update(fit, data=d))
|
|
}
|
|
(ph.lm.boot <- boot(ph, ph.fun, R = 999))
|
|
|
|
fit <- rlm(calls ~ year, method = "MM", data = phones)
|
|
ph <- data.frame(phones, res = resid(fit), fitted = fitted(fit))
|
|
(ph.rlm.boot <- boot(ph, ph.fun, R = 999))
|
|
|
|
|
|
# 6.7 Factorial designs and designed experiments
|
|
|
|
options(contrasts=c("contr.helmert", "contr.poly"))
|
|
(npk.aov <- aov(yield ~ block + N*P*K, data = npk))
|
|
summary(npk.aov)
|
|
|
|
alias(npk.aov)
|
|
coef(npk.aov)
|
|
|
|
options(contrasts=c("contr.treatment", "contr.poly"))
|
|
npk.aov1 <- aov(yield ~ block + N + K, data = npk)
|
|
summary.lm(npk.aov1)
|
|
se.contrast(npk.aov1, list(N == "0", N == "1"), data = npk)
|
|
model.tables(npk.aov1, type = "means", se = TRUE)
|
|
|
|
mp <- c("-", "+")
|
|
(NPK <- expand.grid(N = mp, P = mp, K = mp))
|
|
|
|
if(FALSE) { ## fac.design is part of S-PLUS.
|
|
blocks13 <- fac.design(levels = c(2, 2, 2),
|
|
factor= list(N=mp, P=mp, K=mp), rep = 3, fraction = 1/2)
|
|
|
|
blocks46 <- fac.design(levels = c(2, 2, 2),
|
|
factor = list(N=mp, P=mp, K=mp), rep = 3, fraction = ~ -N:P:K)
|
|
|
|
NPK <- design(block = factor(rep(1:6, each = 4)),
|
|
rbind(blocks13, blocks46))
|
|
i <- order(runif(6)[NPK$block], runif(24))
|
|
NPK <- NPK[i,] # Randomized
|
|
|
|
lev <- rep(2, 7)
|
|
factors <- list(S=mp, D=mp, H=mp, G=mp, R=mp, B=mp, P=mp)
|
|
(Bike <- fac.design(lev, factors,
|
|
fraction = ~ S:D:G + S:H:R + D:H:B + S:D:H:P))
|
|
replications(~ .^2, data=Bike)
|
|
}
|
|
|
|
if(require("FrF2")) {
|
|
NPK <- FrF2(8, factor.names = c("N", "P", "K"), default.levels = 0:1,
|
|
blocks = 2, replications = 3)
|
|
print(NPK)
|
|
print(as.data.frame(NPK))
|
|
|
|
print(Bike <- FrF2(factor.names = c("S", "D", "H", "G", "R", "B", "P"),
|
|
default.levels = c("+", "-"), resolution = 3))
|
|
print(replications(~ .^2, data=Bike))
|
|
}
|
|
|
|
# 6.8 An unbalanced four-way layout
|
|
|
|
attach(quine)
|
|
table(Lrn, Age, Sex, Eth)
|
|
|
|
Means <- tapply(Days, list(Eth, Sex, Age, Lrn), mean)
|
|
Vars <- tapply(Days, list(Eth, Sex, Age, Lrn), var)
|
|
SD <- sqrt(Vars)
|
|
par(mfrow = c(1, 2), pty="s")
|
|
plot(Means, Vars, xlab = "Cell Means", ylab = "Cell Variances")
|
|
plot(Means, SD, xlab = "Cell Means", ylab = "Cell Std Devn.")
|
|
detach()
|
|
|
|
## singular.ok = TRUE is the default in R
|
|
boxcox(Days+1 ~ Eth*Sex*Age*Lrn, data = quine, singular.ok = TRUE,
|
|
lambda = seq(-0.05, 0.45, len = 20))
|
|
|
|
logtrans(Days ~ Age*Sex*Eth*Lrn, data = quine,
|
|
alpha = seq(0.75, 6.5, len = 20), singular.ok = TRUE)
|
|
|
|
quine.hi <- aov(log(Days + 2.5) ~ .^4, quine)
|
|
quine.nxt <- update(quine.hi, . ~ . - Eth:Sex:Age:Lrn)
|
|
dropterm(quine.nxt, test = "F")
|
|
|
|
quine.lo <- aov(log(Days+2.5) ~ 1, quine)
|
|
addterm(quine.lo, quine.hi, test = "F")
|
|
|
|
quine.stp <- stepAIC(quine.nxt,
|
|
scope = list(upper = ~Eth*Sex*Age*Lrn, lower = ~1),
|
|
trace = FALSE)
|
|
quine.stp$anova
|
|
|
|
dropterm(quine.stp, test = "F")
|
|
quine.3 <- update(quine.stp, . ~ . - Eth:Age:Lrn)
|
|
dropterm(quine.3, test = "F")
|
|
quine.4 <- update(quine.3, . ~ . - Eth:Age)
|
|
quine.5 <- update(quine.4, . ~ . - Age:Lrn)
|
|
dropterm(quine.5, test = "F")
|
|
|
|
|
|
# 6.9 Predicting computer performance
|
|
|
|
par(mfrow = c(1, 2), pty = "s")
|
|
boxcox(perf ~ syct + mmin + mmax + cach + chmin + chmax,
|
|
data = cpus, lambda = seq(0, 1, 0.1))
|
|
|
|
cpus1 <- cpus
|
|
attach(cpus)
|
|
for(v in names(cpus)[2:7])
|
|
cpus1[[v]] <- cut(cpus[[v]], unique(quantile(cpus[[v]])),
|
|
include.lowest = TRUE)
|
|
detach()
|
|
boxcox(perf ~ syct + mmin + mmax + cach + chmin + chmax,
|
|
data = cpus1, lambda = seq(-0.25, 1, 0.1))
|
|
par(mfrow = c(1, 1), pty = "m")
|
|
|
|
set.seed(123)
|
|
cpus2 <- cpus[, 2:8] # excludes names, authors' predictions
|
|
cpus2[, 1:3] <- log10(cpus2[, 1:3])
|
|
#cpus.samp <- sample(1:209, 100)
|
|
cpus.samp <-
|
|
c(3, 5, 6, 7, 8, 10, 11, 16, 20, 21, 22, 23, 24, 25, 29, 33, 39, 41, 44, 45,
|
|
46, 49, 57, 58, 62, 63, 65, 66, 68, 69, 73, 74, 75, 76, 78, 83, 86,
|
|
88, 98, 99, 100, 103, 107, 110, 112, 113, 115, 118, 119, 120, 122,
|
|
124, 125, 126, 127, 132, 136, 141, 144, 146, 147, 148, 149, 150, 151,
|
|
152, 154, 156, 157, 158, 159, 160, 161, 163, 166, 167, 169, 170, 173,
|
|
174, 175, 176, 177, 183, 184, 187, 188, 189, 194, 195, 196, 197, 198,
|
|
199, 202, 204, 205, 206, 208, 209)
|
|
|
|
cpus.lm <- lm(log10(perf) ~ ., data = cpus2[cpus.samp, ])
|
|
test.cpus <- function(fit)
|
|
sqrt(sum((log10(cpus2[-cpus.samp, "perf"]) -
|
|
predict(fit, cpus2[-cpus.samp,]))^2)/109)
|
|
test.cpus(cpus.lm)
|
|
cpus.lm2 <- stepAIC(cpus.lm, trace=FALSE)
|
|
cpus.lm2$anova
|
|
test.cpus(cpus.lm2)
|
|
|
|
|
|
# 6.10 Multiple comparisons
|
|
|
|
immer.aov <- aov((Y1 + Y2)/2 ~ Var + Loc, data = immer)
|
|
summary(immer.aov)
|
|
|
|
model.tables(immer.aov, type = "means", se = TRUE, cterms = "Var")
|
|
|
|
if(FALSE) {
|
|
multicomp(immer.aov, plot = TRUE)
|
|
|
|
oats1 <- aov(Y ~ N + V + B, data = oats)
|
|
summary(oats1)
|
|
multicomp(oats1, focus = "V")
|
|
multicomp(oats1, focus = "N", comparisons = "mcc", control = 1)
|
|
lmat <- matrix(c(0,-1,1,rep(0, 11), 0,0,-1,1, rep(0,10),
|
|
0,0,0,-1,1,rep(0,9)),,3,
|
|
dimnames = list(NULL,
|
|
c("0.2cwt-0.0cwt", "0.4cwt-0.2cwt", "0.6cwt-0.4cwt")))
|
|
multicomp(oats1, lmat = lmat, bounds = "lower", comparisons = "none")
|
|
}
|
|
|
|
(tk <- TukeyHSD(immer.aov, which = "Var"))
|
|
plot(tk)
|
|
|
|
oats1 <- aov(Y ~ N + V + B, data = oats)
|
|
(tk <- TukeyHSD(oats1, which = "V"))
|
|
plot(tk)
|
|
|
|
## An alternative under R is to use package multcomp (which requires mvtnorm)
|
|
## This code is for multcomp >= 0.991-1
|
|
library(multcomp)
|
|
## next is slow:
|
|
(tk <- confint(glht(immer.aov, linfct = mcp(Var = "Tukey"))))
|
|
plot(tk)
|
|
|
|
confint(glht(oats1, linfct = mcp(V = "Tukey")))
|
|
lmat <- matrix(c(0,-1,1,rep(0, 11), 0,0,-1,1, rep(0,10),
|
|
0,0,0,-1,1,rep(0,9)),,3,
|
|
dimnames = list(NULL,
|
|
c("0.2cwt-0.0cwt", "0.4cwt-0.2cwt", "0.6cwt-0.4cwt")))
|
|
confint(glht(oats1, linfct = mcp(N = t(lmat[2:5, ])), alternative = "greater"))
|
|
plot(tk)
|
|
|
|
# End of ch06
|