2025-01-12 04:36:52 +08:00

257 lines
9.5 KiB
R

options(na.action=na.exclude) # preserve missings
options(contrasts=c('contr.treatment', 'contr.poly')) #ensure constrast type
library(survival)
#
# Reproduce example 1 in the SAS lifereg documentation
#
# this fit doesn't give the same log-lik that they claim
fit1 <- survreg(Surv(time, status) ~ I(1000/(273.2+temp)), imotor,
subset=(temp>150), dist='lognormal')
summary(fit1)
# This one, with the loglik on the transformed scale (the inappropriate
# scale, Ripley & Venables would argue) does agree.
# All coefs are of course identical.
fit2 <- survreg(Surv(log(time), status) ~ I(1000/(273.2+temp)), imotor,
subset=(temp>150), dist='gaussian')
# Give the quantile estimates, which is the lower half of "output 48.1.5"
# in the SAS 9.2 manual
pp1 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
type='quantile', se=T)
pp2 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
type='uquantile', se=T)
pp1
temp130 <- matrix(0, nrow=3, ncol=6)
temp130[,1] <- pp1$fit[1,]
temp130[,2] <- pp1$se.fit[1,]
temp130[,3] <- pp2$fit[1,]
temp130[,4] <- pp2$se.fit[1,]
temp130[,5] <- exp(pp2$fit[1,] - 1.64*pp2$se.fit[1,])
temp130[,6] <- exp(pp2$fit[1,] + 1.64*pp2$se.fit[1,])
dimnames(temp130) <- list(c("p=.1", "p=.2", "p=.3"),
c("Time", "se(time)", "log(time)", "se[log(time)]",
"lower 90", "upper 90"))
print(temp130)
# A set of examples, copied from the manual pages of SAS procedure
# "reliability", which is part of their QC product.
#
color <- c("black", "red", "green", "blue", "magenta", "red4",
"orange", "DarkGreen", "cyan2", "DarkViolet")
palette(color)
pdf(file='reliability.pdf')
#
# Insulating fluids example
#
# Adding a -1 to the fit just causes the each group to have it's own
# intercept, rather than a global intercept + constrasts. The strata
# statement allows each to have a separate scale
ffit <- survreg(Surv(time) ~ factor(voltage) + strata(voltage) -1, ifluid)
# Get predicted quantiles at each of the voltages
# By default predict() would give a line of results for each observation,
# I only want the unique set of x's, i.e., only 4 cases
uvolt <- sort(unique(ifluid$voltage)) #the unique levels
plist <- c(1, 2, 5, 1:9 *10, 95, 99)/100
pred <- predict(ffit, type='quantile', p=plist,
newdata=data.frame(voltage=uvolt))
tfun <- function(x) log(-log(1-x))
matplot(t(pred), tfun(plist), type='l', log='x', lty=1,
col=1:4, yaxt='n',
xlab="Predicted", ylab="Quantile")
axis(2, tfun(plist), format(100*plist), adj=1)
kfit <- survfit(Surv(time) ~ voltage, ifluid, type='fleming') #KM fit
for (i in 1:4) {
temp <- kfit[i]
points(temp$time, tfun(1-temp$surv), col=i, pch=i)
}
# Now a table
temp <- array(0, dim=c(4,4,4)) #4 groups by 4 parameters by 4 stats
temp[,1,1] <- ffit$coef # "EV Location" in SAS manual
temp[,2,1] <- ffit$scale # "EV scale"
temp[,3,1] <- exp(ffit$coef) # "Weibull Scale"
temp[,4,1] <- 1/ffit$scale # "Weibull Shape"
temp[,1,2] <- sqrt(diag(ffit$var))[1:4] #standard error
temp[,2,2] <- sqrt(diag(ffit$var))[5:8] * ffit$scale
temp[,3,2] <- temp[,1,2] * temp[,3,1]
temp[,4,2] <- temp[,2,2] / (temp[,2,1])^2
temp[,1,3] <- temp[,1,1] - 1.96*temp[,1,2] #lower conf limits
temp[,1,4] <- temp[,1,1] + 1.96*temp[,1,2] # upper
# log(scale) is the natural parameter, in which the routine did its fitting
# and on which the std errors were computed
temp[,2, 3] <- exp(log(ffit$scale) - 1.96*sqrt(diag(ffit$var))[5:8])
temp[,2, 4] <- exp(log(ffit$scale) + 1.96*sqrt(diag(ffit$var))[5:8])
temp[,3, 3:4] <- exp(temp[,1,3:4])
temp[,4, 3:4] <- 1/temp[,2,4:3]
dimnames(temp) <- list(uvolt, c("EV Location", "EV Scale", "Weibull scale",
"Weibull shape"),
c("Estimate", "SE", "lower 95% CI", "uppper 95% CI"))
print(aperm(temp, c(2,3,1)), digits=5)
rm(temp, uvolt, plist, pred, ffit, kfit)
#####################################################################
# Turbine cracks data
crack2 <- with(cracks, data.frame(day1=c(NA, days), day2=c(days, NA),
n=c(fail, 167-sum(fail))))
cfit <- survreg(Surv(day1, day2, type='interval2') ~1,
dist='weibull', data=crack2, weights=n)
summary(cfit)
#Their output also has Wiebull scale = exp(cfit$coef), shape = 1/(cfit$scale)
# Draw the SAS plot
# The "type=fleming" argument reflects that they estimate hazards rather than
# survival, and forces a Nelson-Aalen hazard estimate
#
plist <- c(1, 2, 5, 1:8 *10)/100
plot(qsurvreg(plist, cfit$coef, cfit$scale), tfun(plist), log='x',
yaxt='n', type='l',
xlab="Weibull Plot for Time", ylab="Percent")
axis(2, tfun(plist), format(100*plist), adj=1)
kfit <- survfit(Surv(day1, day2, type='interval2') ~1, data=crack2,
weight=n, type='fleming')
# Only plot point where n.event > 0
# Why? I'm trying to match them. Personally, all should be plotted.
who <- (kfit$n.event > 0)
points(kfit$time[who], tfun(1-kfit$surv[who]), pch='+')
points(kfit$time[who], tfun(1-kfit$upper[who]), pch='-')
points(kfit$time[who], tfun(1-kfit$lower[who]), pch='-')
text(rep(3,6), seq(.5, -1.0, length=6),
c("Scale", "Shape", "Right Censored", "Left Censored",
"Interval Censored", "Fit"), adj=0)
text(rep(9,6), seq(.5, -1.0, length=6),
c(format(round(exp(cfit$coef), 2)),
format(round(1/cfit$scale, 2)),
format(tapply(crack2$n, cfit$y[,3], sum)), "ML"), adj=1)
# Now a portion of his percentiles table
# I don't get the same SE as SAS, I haven't checked out why. The
# estimates and se for the underlying Weibull model are the same.
temp <- predict(cfit, type='quantile', p=plist, se=T)
tempse <- sqrt(temp$se[1,])
mat <- cbind(temp$fit[1,], tempse,
temp$fit[1,] -1.96*tempse, temp$fit[1,] + 1.96*tempse)
dimnames(mat) <- list(plist*100, c("Estimate", "SE", "Lower .95", "Upper .95"))
print(mat)
#
# The cracks data has a particularly easy estimate, so use
# it to double check code
time <- c(crack2$day2[1], (crack2$day1 + crack2$day2)[2:8]/2,
crack2$day1[9])
cdf <- cumsum(crack2$n)/sum(crack2$n)
all.equal(kfit$time, time)
all.equal(kfit$surv, 1-cdf[c(1:8,8)])
rm(time, cdf, kfit)
#######################################################
#
# Replacement of valve seats in diesel engines
# The input data has id, time, and an indicator of whether there was an
# event at that time: 0=no, 1=yes. No one has an event at their last time.
# The input data has two engines with dual failures: 328 loses 2 valves at
# time 653, and number 402 loses 2 at time 139. For each, fudge the first
# time to be .1 days earlier.
#
ties <- which(diff(valveSeat$time)==0 & diff(valveSeat$id)==0)
temp <- valveSeat$time
temp[ties] <- temp[ties] - .1
n <- length(temp)
first <- !duplicated(valveSeat$id)
vtemp <- with(valveSeat, data.frame(id =id,
time1= ifelse(first, 0, c(0, temp[-n])),
time2= temp, status=status))
kfit <- survfit(Surv(time1, time2, status) ~1, vtemp, id=id)
plot(kfit, fun='cumhaz', ylab="Sample Mean Cumulative Failures", xlab='Time')
title("Valve replacement data")
# The summary.survfit function doesn't have an option for printing out
# cumulative hazards instead of survival --- need to add that
# so I just reprise the central code of print.summary.survfit
xx <- summary(kfit)
temp <- cbind(xx$time, xx$n.risk, xx$n.event, xx$cumhaz,
xx$std.chaz, -log(xx$upper), -log(xx$lower))
dimnames(temp) <- list(rep("", nrow(temp)),
c("time", "n.risk", "n.event", "Cum haz", "std.err",
"lower 95%", "upper 95%"))
print(temp, digits=2)
# Note that I have the same estimates but different SE's than SAS. We are using
# a different estimator. It's a statistical argument as to which is
# better (one could defend both sides): SAS the more standard estimate found
# in the reliability literature, mine the estimate from statistics literature
rm(temp, kfit, xx)
######################################################
# Turbine data, lognormal fit
turbine <- read.table('data.turbine',
col.names=c("time1", "time2", "n"))
tfit <- survreg(Surv(time1, time2, type='interval2') ~1, turbine,
dist='lognormal', weights=n, subset=(n>0))
summary(tfit)
# Now, do his plot, but put bootstrap confidence bands on it!
# First, make a simple data set without weights
tdata <- turbine[rep(1:nrow(turbine), turbine$n),]
qstat <- function(data) {
temp <- survreg(Surv(time1, time2, type='interval2') ~1, data=data,
dist='lognormal')
qsurvreg(plist, temp$coef, temp$scale, dist='lognormal')
}
{if (exists('bootstrap')) {
set.seed(1953) # a good year :-)
bfit <- bootstrap(tdata, qstat, B=1000)
bci <- limits.bca(bfit, probs=c(.025, .975))
}
else {
values <- matrix(0, nrow=1000, ncol=length(plist))
n <- nrow(tdata)
for (i in 1:1000) {
subset <- sample(1:n, n, replace=T)
values[i,] <- qstat(tdata[subset,])
}
bci <- t(apply(values,2, quantile, c(.05, .95)))
}
}
xmat <- cbind(qsurvreg(plist, tfit$coef, tfit$scale, dist='lognormal'),
bci)
matplot(xmat, qnorm(plist),
type='l', lty=c(1,2,2), col=c(1,1,1),
log='x', yaxt='n', ylab='Percent',
xlab='Time of Cracking (Hours x 100)')
axis(2, qnorm(plist), format(100*plist), adj=1)
title("Turbine Data")
kfit <- survfit(Surv(time1, time2, type='interval2') ~1, data=tdata)
points(kfit$time, qnorm(1-kfit$surv), pch='+')
dev.off() #close the plot file