284 lines
7.0 KiB
R
284 lines
7.0 KiB
R
|
#-*- R -*-
|
||
|
|
||
|
## Script from Fourth Edition of `Modern Applied Statistics with S'
|
||
|
|
||
|
# Chapter 5 Univariate Statistics
|
||
|
|
||
|
# for later use, from section 5.6
|
||
|
perm.t.test <- function(d) {
|
||
|
# ttest is function(x) mean(x)/sqrt(var(x)/length(x))
|
||
|
binary.v <- function(x, digits) {
|
||
|
if(missing(digits)) {
|
||
|
mx <- max(x)
|
||
|
digits <- if(mx > 0) 1 + floor(log(mx, base = 2)) else 1
|
||
|
}
|
||
|
ans <- 0:(digits - 1)
|
||
|
lx <- length(x)
|
||
|
x <- rep(x, rep(digits, lx))
|
||
|
x <- (x %/% 2^ans) %% 2
|
||
|
dim(x) <- c(digits, lx)
|
||
|
x
|
||
|
}
|
||
|
digits <- length(d)
|
||
|
n <- 2^digits
|
||
|
x <- d * 2 * (binary.v(1:n, digits) - 0.5)
|
||
|
mx <- matrix(1/digits, 1, digits) %*% x
|
||
|
s <- matrix(1/(digits - 1), 1, digits)
|
||
|
vx <- s %*% (x - matrix(mx, digits, n, byrow=TRUE))^2
|
||
|
as.vector(mx/sqrt(vx/digits))
|
||
|
}
|
||
|
|
||
|
library(MASS)
|
||
|
options(width=65, digits=5, height=9999)
|
||
|
library(lattice)
|
||
|
pdf(file="ch05.pdf", width=8, height=6, pointsize=9)
|
||
|
|
||
|
rm(A, B) # precautionary clear-out
|
||
|
attach(shoes)
|
||
|
tperm <- perm.t.test(B - A) # see section 5.6
|
||
|
detach()
|
||
|
|
||
|
# from ch04
|
||
|
if(!exists("fgl.df")) {
|
||
|
fgl0 <- fgl[ ,-10] # omit type.
|
||
|
fgl.df <- data.frame(type = rep(fgl$type, 9),
|
||
|
y = as.vector(as.matrix(fgl0)),
|
||
|
meas = factor(rep(1:9, each = 214), labels = names(fgl0)))
|
||
|
invisible()
|
||
|
}
|
||
|
|
||
|
|
||
|
# 5.1 Probability distributions
|
||
|
|
||
|
|
||
|
x <- rt(250, df = 9)
|
||
|
par(pty = "s")
|
||
|
qqnorm(x)
|
||
|
qqline(x)
|
||
|
par(pty = "m")
|
||
|
|
||
|
x <- rgamma(100, shape = 5, rate = 0.1)
|
||
|
fitdistr(x, "gamma")
|
||
|
x2 <- rt(250, df = 9)
|
||
|
fitdistr(x2, "t", df = 9)
|
||
|
fitdistr(x2, "t")
|
||
|
|
||
|
|
||
|
# 5.2 Generating random data
|
||
|
|
||
|
contam <- rnorm( 100, 0, (1 + 2*rbinom(100, 1, 0.05)) )
|
||
|
|
||
|
|
||
|
# 5.3 Data summaries
|
||
|
|
||
|
par(mfrow=c(2,3))
|
||
|
hist(geyser$duration, "scott", xlab="duration")
|
||
|
hist(chem, "scott")
|
||
|
hist(tperm, "scott")
|
||
|
hist(geyser$duration, "FD", xlab="duration")
|
||
|
hist(chem, "FD")
|
||
|
hist(tperm, "FD")
|
||
|
par(mfrow=c(1,1))
|
||
|
|
||
|
swiss.fertility <- swiss[, 1]
|
||
|
stem(swiss.fertility)
|
||
|
stem(chem)
|
||
|
stem(abbey)
|
||
|
stem(abbey, scale = 0.4) ## use scale = 0.4 in R
|
||
|
|
||
|
|
||
|
par(mfrow = c(1,2))
|
||
|
boxplot(chem, sub = "chem", range = 0.5)
|
||
|
boxplot(abbey, sub = "abbey")
|
||
|
par(mfrow = c(1,1))
|
||
|
|
||
|
bwplot(type ~ y | meas, data = fgl.df, scales = list(x="free"),
|
||
|
strip = function(...) strip.default(..., style=1), xlab = "")
|
||
|
|
||
|
|
||
|
# 5.4 Classical univariate statistics
|
||
|
|
||
|
attach(shoes)
|
||
|
t.test(A, mu = 10)
|
||
|
t.test(A)$conf.int
|
||
|
|
||
|
wilcox.test(A, mu = 10)
|
||
|
|
||
|
var.test(A, B)
|
||
|
|
||
|
t.test(A, B, var.equal = TRUE)
|
||
|
|
||
|
t.test(A, B, var.equal = FALSE)
|
||
|
|
||
|
wilcox.test(A, B)
|
||
|
|
||
|
t.test(A, B, paired = TRUE)
|
||
|
|
||
|
wilcox.test(A, B, paired = TRUE)
|
||
|
detach()
|
||
|
|
||
|
par(mfrow = c(1, 2))
|
||
|
truehist(tperm, xlab = "diff")
|
||
|
x <- seq(-4,4, 0.1)
|
||
|
lines(x, dt(x,9))
|
||
|
#cdf.compare(tperm, distribution = "t", df = 9)
|
||
|
sres <- c(sort(tperm), 4)
|
||
|
yres <- (0:1024)/1024
|
||
|
plot(sres, yres, type="S", xlab="diff", ylab="")
|
||
|
lines(x, pt(x,9), lty=3)
|
||
|
|
||
|
legend(-5, 1.05, c("Permutation dsn","t_9 cdf"), lty = c(1,3))
|
||
|
par(mfrow = c(1, 1))
|
||
|
|
||
|
|
||
|
# 5.5 Robust summaries
|
||
|
|
||
|
# Figure 5.7 was obtained by
|
||
|
x <- seq(-10, 10, len=500)
|
||
|
y <- dt(x, 25, log = TRUE)
|
||
|
z <- -diff(y)/diff(x)
|
||
|
plot(x[-1], z, type = "l", xlab = "", ylab = "psi")
|
||
|
y2 <- dt(x, 5, log = TRUE)
|
||
|
z2 <- -diff(y2)/diff(x)
|
||
|
lines(x[-1], z2, lty = 2)
|
||
|
|
||
|
|
||
|
sort(chem)
|
||
|
mean(chem)
|
||
|
median(chem)
|
||
|
#location.m(chem)
|
||
|
#location.m(chem, psi.fun="huber")
|
||
|
mad(chem)
|
||
|
#scale.tau(chem)
|
||
|
#scale.tau(chem, center=3.68)
|
||
|
unlist(huber(chem))
|
||
|
unlist(hubers(chem))
|
||
|
fitdistr(chem, "t", list(m = 3, s = 0.5), df = 5)
|
||
|
|
||
|
sort(abbey)
|
||
|
mean(abbey)
|
||
|
median(abbey)
|
||
|
#location.m(abbey)
|
||
|
#location.m(abbey, psi.fun="huber")
|
||
|
unlist(hubers(abbey))
|
||
|
unlist(hubers(abbey, k = 2))
|
||
|
unlist(hubers(abbey, k = 1))
|
||
|
fitdistr(abbey, "t", list(m = 12, s = 5), df = 10)
|
||
|
|
||
|
|
||
|
# 5.6 Density estimation
|
||
|
|
||
|
# Figure 5.8
|
||
|
attach(geyser)
|
||
|
par(mfrow=c(2,3))
|
||
|
truehist(duration, h=0.5, x0=0.0, xlim=c(0, 6), ymax=0.7)
|
||
|
truehist(duration, h=0.5, x0=0.1, xlim=c(0, 6), ymax=0.7)
|
||
|
truehist(duration, h=0.5, x0=0.2, xlim=c(0, 6), ymax=0.7)
|
||
|
truehist(duration, h=0.5, x0=0.3, xlim=c(0, 6), ymax=0.7)
|
||
|
truehist(duration, h=0.5, x0=0.4, xlim=c(0, 6), ymax=0.7)
|
||
|
|
||
|
breaks <- seq(0, 5.9, 0.1)
|
||
|
counts <- numeric(length(breaks))
|
||
|
for(i in (0:4)) counts[i+(1:55)] <- counts[i+(1:55)] +
|
||
|
rep(hist(duration, breaks=0.1*i + seq(0, 5.5, 0.5),
|
||
|
prob=TRUE, plot=FALSE)$density, rep(5,11))
|
||
|
plot(breaks+0.05, counts/5, type="l", xlab="duration",
|
||
|
ylab="averaged", bty="n", xlim=c(0, 6), ylim=c(0, 0.7))
|
||
|
detach()
|
||
|
|
||
|
|
||
|
attach(geyser)
|
||
|
truehist(duration, nbins = 15, xlim = c(0.5, 6), ymax = 1.2)
|
||
|
lines(density(duration, width = "nrd"))
|
||
|
|
||
|
truehist(duration, nbins = 15, xlim = c(0.5, 6), ymax = 1.2)
|
||
|
lines(density(duration, width = "SJ", n = 256), lty = 3)
|
||
|
lines(density(duration, n = 256, width = "SJ-dpi"), lty = 1)
|
||
|
detach()
|
||
|
|
||
|
gal <- galaxies/1000
|
||
|
plot(x = c(0, 40), y = c(0, 0.3), type = "n", bty = "l",
|
||
|
xlab = "velocity of galaxy (1000km/s)", ylab = "density")
|
||
|
rug(gal)
|
||
|
lines(density(gal, width = "SJ-dpi", n = 256), lty = 1)
|
||
|
lines(density(gal, width = "SJ", n = 256), lty = 3)
|
||
|
library(polspline)
|
||
|
x <- seq(5, 40, length = 500)
|
||
|
lines(x, doldlogspline(x, oldlogspline(gal)), lty = 2)
|
||
|
|
||
|
|
||
|
geyser2 <- data.frame(as.data.frame(geyser)[-1, ],
|
||
|
pduration = geyser$duration[-299])
|
||
|
attach(geyser2)
|
||
|
par(mfrow = c(2, 2))
|
||
|
plot(pduration, waiting, xlim = c(0.5, 6), ylim = c(40, 110),
|
||
|
xlab = "previous duration", ylab = "waiting")
|
||
|
f1 <- kde2d(pduration, waiting, n = 50, lims=c(0.5, 6, 40, 110))
|
||
|
image(f1, zlim = c(0, 0.075),
|
||
|
xlab = "previous duration", ylab = "waiting")
|
||
|
f2 <- kde2d(pduration, waiting, n = 50, lims=c(0.5, 6, 40, 110),
|
||
|
h = c(width.SJ(duration), width.SJ(waiting)) )
|
||
|
image(f2, zlim = c(0, 0.075),
|
||
|
xlab = "previous duration", ylab = "waiting")
|
||
|
persp(f2, phi = 30, theta = 20, d = 5,
|
||
|
xlab = "previous duration", ylab = "waiting", zlab = "")
|
||
|
detach()
|
||
|
|
||
|
density(gal, n = 1, from = 20.833, to = 20.834, width = "SJ")$y
|
||
|
1/(2 * sqrt(length(gal)) * 0.13)
|
||
|
|
||
|
set.seed(101)
|
||
|
m <- 1000
|
||
|
res <- numeric(m)
|
||
|
for (i in 1:m) res[i] <- median(sample(gal, replace = TRUE))
|
||
|
mean(res - median(gal))
|
||
|
sqrt(var(res))
|
||
|
|
||
|
truehist(res, h = 0.1)
|
||
|
lines(density(res, width = "SJ-dpi", n = 256))
|
||
|
quantile(res, p = c(0.025, 0.975))
|
||
|
x <- seq(19.5, 22.5, length = 500)
|
||
|
lines(x, doldlogspline(x, oldlogspline(res)), lty = 3)
|
||
|
|
||
|
library(boot)
|
||
|
set.seed(101)
|
||
|
gal.boot <- boot(gal, function(x, i) median(x[i]), R = 1000)
|
||
|
gal.boot
|
||
|
|
||
|
boot.ci(gal.boot, conf = c(0.90, 0.95),
|
||
|
type = c("norm","basic","perc","bca"))
|
||
|
plot(gal.boot)
|
||
|
|
||
|
if(FALSE) { # bootstrap() is an S-PLUS function
|
||
|
gal.bt <- bootstrap(gal, median, seed = 101, B = 1000)
|
||
|
summary(gal.bt)
|
||
|
plot(gal.bt)
|
||
|
qqnorm(gal.bt)
|
||
|
|
||
|
limits.emp(gal.bt)
|
||
|
limits.bca(gal.bt)
|
||
|
}
|
||
|
|
||
|
sim.gen <- function(data, mle) {
|
||
|
n <- length(data)
|
||
|
data[sample(n, replace = TRUE)] + mle*rnorm(n)
|
||
|
}
|
||
|
gal.boot2 <- boot(gal, median, R = 1000,
|
||
|
sim = "parametric", ran.gen = sim.gen, mle = 0.5)
|
||
|
boot.ci(gal.boot2, conf = c(0.90, 0.95),
|
||
|
type = c("norm","basic","perc"))
|
||
|
|
||
|
attach(shoes)
|
||
|
t.test(B - A)
|
||
|
shoes.boot <- boot(B - A, function(x,i) mean(x[i]), R = 1000)
|
||
|
boot.ci(shoes.boot, type = c("norm", "basic", "perc", "bca"))
|
||
|
mean.fun <- function(d, i) {
|
||
|
n <- length(i)
|
||
|
c(mean(d[i]), (n-1)*var(d[i])/n^2)
|
||
|
}
|
||
|
shoes.boot2 <- boot(B - A, mean.fun, R = 1000)
|
||
|
boot.ci(shoes.boot2, type = "stud")
|
||
|
detach()
|
||
|
|
||
|
# End of ch05
|