#-*- R -*- ## Script from Fourth Edition of `Modern Applied Statistics with S' # Chapter 6 Linear Statistical Models library(MASS) library(lattice) options(width=65, digits=5, height=9999) pdf(file="ch06.pdf", width=8, height=6, pointsize=9) options(contrasts = c("contr.helmert", "contr.poly")) # 6.1 A linear regression example xyplot(Gas ~ Temp | Insul, whiteside, panel = function(x, y, ...) { panel.xyplot(x, y, ...) panel.lmline(x, y, ...) }, xlab = "Average external temperature (deg. C)", ylab = "Gas consumption (1000 cubic feet)", aspect = "xy", strip = function(...) strip.default(..., style = 1)) gasB <- lm(Gas ~ Temp, data = whiteside, subset = Insul=="Before") gasA <- update(gasB, subset = Insul=="After") summary(gasB) summary(gasA) varB <- deviance(gasB)/gasB$df.resid # direct calculation varB <- summary(gasB)$sigma^2 # alternative gasBA <- lm(Gas ~ Insul/Temp - 1, data = whiteside) summary(gasBA) gasQ <- lm(Gas ~ Insul/(Temp + I(Temp^2)) - 1, data = whiteside) summary(gasQ)$coef # R: options(contrasts = c("contr.helmert", "contr.poly")) gasPR <- lm(Gas ~ Insul + Temp, data = whiteside) anova(gasPR, gasBA) oldcon <- options(contrasts = c("contr.treatment", "contr.poly")) gasBA1 <- lm(Gas ~ Insul*Temp, data = whiteside) summary(gasBA1)$coef options(oldcon) # 6.2 Model formulae and model matrices dat <- data.frame(a = factor(rep(1:3, 3)), y = rnorm(9, rep(2:4, 3), 0.1)) obj <- lm(y ~ a, dat) (alf.star <- coef(obj)) Ca <- contrasts(dat$a) # contrast matrix for `a' drop(Ca %*% alf.star[-1]) dummy.coef(obj) N <- factor(Nlevs <- c(0,1,2,4)) contrasts(N) contrasts(ordered(N)) N2 <- N contrasts(N2, 2) <- poly(Nlevs, 2) N2 <- C(N, poly(Nlevs, 2), 2) # alternative contrasts(N2) fractions(ginv(contr.helmert(n = 4))) Cp <- diag(-1, 4, 5); Cp[row(Cp) == col(Cp) - 1] <- 1 Cp fractions(ginv(Cp)) # 6.3 Regression diagnostics (hills.lm <- lm(time ~ dist + climb, data = hills)) frame() par(fig = c(0, 0.6, 0, 0.55)) plot(fitted(hills.lm), studres(hills.lm)) abline(h = 0, lty = 2) # identify(fitted(hills.lm), studres(hills.lm), row.names(hills)) par(fig = c(0.6, 1, 0, 0.55), pty = "s") qqnorm(studres(hills.lm)) qqline(studres(hills.lm)) par(pty = "m") hills.hat <- lm.influence(hills.lm)$hat cbind(hills, lev = hills.hat)[hills.hat > 3/35, ] cbind(hills, pred = predict(hills.lm))["Knock Hill", ] (hills1.lm <- update(hills.lm, subset = -18)) update(hills.lm, subset = -c(7, 18)) summary(hills1.lm) summary(update(hills1.lm, weights = 1/dist^2)) lm(time ~ -1 + dist + climb, hills[-18, ], weight = 1/dist^2) # hills <- hills # make a local copy (needed in S-PLUS) hills$ispeed <- hills$time/hills$dist hills$grad <- hills$climb/hills$dist (hills2.lm <- lm(ispeed ~ grad, data = hills[-18, ])) frame() par(fig = c(0, 0.6, 0, 0.55)) plot(hills$grad[-18], studres(hills2.lm), xlab = "grad") abline(h = 0, lty = 2) # identify(hills$grad[-18], studres(hills2.lm), row.names(hills)[-18]) par(fig = c(0.6, 1, 0, 0.55), pty = "s") qqnorm(studres(hills2.lm)) qqline(studres(hills2.lm)) par(pty = "m") hills2.hat <- lm.influence(hills2.lm)$hat cbind(hills[-18,], lev = hills2.hat)[hills2.hat > 1.8*2/34, ] # 6.4 Safe prediction quad1 <- lm(Weight ~ Days + I(Days^2), data = wtloss) quad2 <- lm(Weight ~ poly(Days, 2), data = wtloss) new.x <- data.frame(Days = seq(250, 300, 10), row.names = seq(250, 300, 10)) predict(quad1, newdata = new.x) predict(quad2, newdata = new.x) # predict.gam(quad2, newdata = new.x) # S-PLUS only # 6.5 Robust and resistant regression # library(lqs) phones.lm <- lm(calls ~ year, data = phones) attach(phones); plot(year, calls); detach() abline(phones.lm$coef) abline(rlm(calls ~ year, phones, maxit=50), lty = 2, col = 2) abline(lqs(calls ~ year, phones), lty =3, col = 3) # legend(locator(1), lty = 1:3, col = 1:3, # legend = c("least squares", "M-estimate", "LTS")) ## cor = FALSE is the default in R summary(lm(calls ~ year, data = phones)) summary(rlm(calls ~ year, maxit = 50, data = phones)) summary(rlm(calls ~ year, scale.est = "proposal 2", data = phones)) summary(rlm(calls ~ year, data = phones, psi = psi.bisquare)) lqs(calls ~ year, data = phones) lqs(calls ~ year, data = phones, method = "lms") lqs(calls ~ year, data = phones, method = "S") summary(rlm(calls ~ year, data = phones, method = "MM")) # library(robust) # S-PLUS only # phones.lmr <- lmRob(calls ~ year, data = phones) # summary(phones.lmr) # plot(phones.lmr) hills.lm hills1.lm # omitting Knock Hill rlm(time ~ dist + climb, data = hills) summary(rlm(time ~ dist + climb, data = hills, weights = 1/dist^2, method = "MM")) lqs(time ~ dist + climb, data = hills, nsamp = "exact") summary(hills2.lm) # omitting Knock Hill summary(rlm(ispeed ~ grad, data = hills)) summary(rlm(ispeed ~ grad, data = hills, method="MM")) # summary(lmRob(ispeed ~ grad, data = hills)) lqs(ispeed ~ grad, data = hills) # 6.6 Bootstrapping linear models library(boot) fit <- lm(calls ~ year, data = phones) ph <- data.frame(phones, res = resid(fit), fitted = fitted(fit)) ph.fun <- function(data, i) { d <- data d$calls <- d$fitted + d$res[i] coef(update(fit, data=d)) } (ph.lm.boot <- boot(ph, ph.fun, R = 999)) fit <- rlm(calls ~ year, method = "MM", data = phones) ph <- data.frame(phones, res = resid(fit), fitted = fitted(fit)) (ph.rlm.boot <- boot(ph, ph.fun, R = 999)) # 6.7 Factorial designs and designed experiments options(contrasts=c("contr.helmert", "contr.poly")) (npk.aov <- aov(yield ~ block + N*P*K, data = npk)) summary(npk.aov) alias(npk.aov) coef(npk.aov) options(contrasts=c("contr.treatment", "contr.poly")) npk.aov1 <- aov(yield ~ block + N + K, data = npk) summary.lm(npk.aov1) se.contrast(npk.aov1, list(N == "0", N == "1"), data = npk) model.tables(npk.aov1, type = "means", se = TRUE) mp <- c("-", "+") (NPK <- expand.grid(N = mp, P = mp, K = mp)) if(FALSE) { ## fac.design is part of S-PLUS. blocks13 <- fac.design(levels = c(2, 2, 2), factor= list(N=mp, P=mp, K=mp), rep = 3, fraction = 1/2) blocks46 <- fac.design(levels = c(2, 2, 2), factor = list(N=mp, P=mp, K=mp), rep = 3, fraction = ~ -N:P:K) NPK <- design(block = factor(rep(1:6, each = 4)), rbind(blocks13, blocks46)) i <- order(runif(6)[NPK$block], runif(24)) NPK <- NPK[i,] # Randomized lev <- rep(2, 7) factors <- list(S=mp, D=mp, H=mp, G=mp, R=mp, B=mp, P=mp) (Bike <- fac.design(lev, factors, fraction = ~ S:D:G + S:H:R + D:H:B + S:D:H:P)) replications(~ .^2, data=Bike) } if(require("FrF2")) { NPK <- FrF2(8, factor.names = c("N", "P", "K"), default.levels = 0:1, blocks = 2, replications = 3) print(NPK) print(as.data.frame(NPK)) print(Bike <- FrF2(factor.names = c("S", "D", "H", "G", "R", "B", "P"), default.levels = c("+", "-"), resolution = 3)) print(replications(~ .^2, data=Bike)) } # 6.8 An unbalanced four-way layout attach(quine) table(Lrn, Age, Sex, Eth) Means <- tapply(Days, list(Eth, Sex, Age, Lrn), mean) Vars <- tapply(Days, list(Eth, Sex, Age, Lrn), var) SD <- sqrt(Vars) par(mfrow = c(1, 2), pty="s") plot(Means, Vars, xlab = "Cell Means", ylab = "Cell Variances") plot(Means, SD, xlab = "Cell Means", ylab = "Cell Std Devn.") detach() ## singular.ok = TRUE is the default in R boxcox(Days+1 ~ Eth*Sex*Age*Lrn, data = quine, singular.ok = TRUE, lambda = seq(-0.05, 0.45, len = 20)) logtrans(Days ~ Age*Sex*Eth*Lrn, data = quine, alpha = seq(0.75, 6.5, len = 20), singular.ok = TRUE) quine.hi <- aov(log(Days + 2.5) ~ .^4, quine) quine.nxt <- update(quine.hi, . ~ . - Eth:Sex:Age:Lrn) dropterm(quine.nxt, test = "F") quine.lo <- aov(log(Days+2.5) ~ 1, quine) addterm(quine.lo, quine.hi, test = "F") quine.stp <- stepAIC(quine.nxt, scope = list(upper = ~Eth*Sex*Age*Lrn, lower = ~1), trace = FALSE) quine.stp$anova dropterm(quine.stp, test = "F") quine.3 <- update(quine.stp, . ~ . - Eth:Age:Lrn) dropterm(quine.3, test = "F") quine.4 <- update(quine.3, . ~ . - Eth:Age) quine.5 <- update(quine.4, . ~ . - Age:Lrn) dropterm(quine.5, test = "F") # 6.9 Predicting computer performance par(mfrow = c(1, 2), pty = "s") boxcox(perf ~ syct + mmin + mmax + cach + chmin + chmax, data = cpus, lambda = seq(0, 1, 0.1)) cpus1 <- cpus attach(cpus) for(v in names(cpus)[2:7]) cpus1[[v]] <- cut(cpus[[v]], unique(quantile(cpus[[v]])), include.lowest = TRUE) detach() boxcox(perf ~ syct + mmin + mmax + cach + chmin + chmax, data = cpus1, lambda = seq(-0.25, 1, 0.1)) par(mfrow = c(1, 1), pty = "m") set.seed(123) cpus2 <- cpus[, 2:8] # excludes names, authors' predictions cpus2[, 1:3] <- log10(cpus2[, 1:3]) #cpus.samp <- sample(1:209, 100) cpus.samp <- c(3, 5, 6, 7, 8, 10, 11, 16, 20, 21, 22, 23, 24, 25, 29, 33, 39, 41, 44, 45, 46, 49, 57, 58, 62, 63, 65, 66, 68, 69, 73, 74, 75, 76, 78, 83, 86, 88, 98, 99, 100, 103, 107, 110, 112, 113, 115, 118, 119, 120, 122, 124, 125, 126, 127, 132, 136, 141, 144, 146, 147, 148, 149, 150, 151, 152, 154, 156, 157, 158, 159, 160, 161, 163, 166, 167, 169, 170, 173, 174, 175, 176, 177, 183, 184, 187, 188, 189, 194, 195, 196, 197, 198, 199, 202, 204, 205, 206, 208, 209) cpus.lm <- lm(log10(perf) ~ ., data = cpus2[cpus.samp, ]) test.cpus <- function(fit) sqrt(sum((log10(cpus2[-cpus.samp, "perf"]) - predict(fit, cpus2[-cpus.samp,]))^2)/109) test.cpus(cpus.lm) cpus.lm2 <- stepAIC(cpus.lm, trace=FALSE) cpus.lm2$anova test.cpus(cpus.lm2) # 6.10 Multiple comparisons immer.aov <- aov((Y1 + Y2)/2 ~ Var + Loc, data = immer) summary(immer.aov) model.tables(immer.aov, type = "means", se = TRUE, cterms = "Var") if(FALSE) { multicomp(immer.aov, plot = TRUE) oats1 <- aov(Y ~ N + V + B, data = oats) summary(oats1) multicomp(oats1, focus = "V") multicomp(oats1, focus = "N", comparisons = "mcc", control = 1) lmat <- matrix(c(0,-1,1,rep(0, 11), 0,0,-1,1, rep(0,10), 0,0,0,-1,1,rep(0,9)),,3, dimnames = list(NULL, c("0.2cwt-0.0cwt", "0.4cwt-0.2cwt", "0.6cwt-0.4cwt"))) multicomp(oats1, lmat = lmat, bounds = "lower", comparisons = "none") } (tk <- TukeyHSD(immer.aov, which = "Var")) plot(tk) oats1 <- aov(Y ~ N + V + B, data = oats) (tk <- TukeyHSD(oats1, which = "V")) plot(tk) ## An alternative under R is to use package multcomp (which requires mvtnorm) ## This code is for multcomp >= 0.991-1 library(multcomp) ## next is slow: (tk <- confint(glht(immer.aov, linfct = mcp(Var = "Tukey")))) plot(tk) confint(glht(oats1, linfct = mcp(V = "Tukey"))) lmat <- matrix(c(0,-1,1,rep(0, 11), 0,0,-1,1, rep(0,10), 0,0,0,-1,1,rep(0,9)),,3, dimnames = list(NULL, c("0.2cwt-0.0cwt", "0.4cwt-0.2cwt", "0.6cwt-0.4cwt"))) confint(glht(oats1, linfct = mcp(N = t(lmat[2:5, ])), alternative = "greater")) plot(tk) # End of ch06