options(na.action=na.exclude) # preserve missings
options(contrasts=c('contr.treatment', 'contr.poly')) # ensure contrast type
library(survival)

#
# Reproduce example 1 in the SAS lifereg documentation
#

# This fit doesn't give the same log-lik that they claim
fit1 <- survreg(Surv(time, status) ~ I(1000/(273.2+temp)), imotor,
                subset=(temp>150), dist='lognormal')
summary(fit1)

# This one, with the loglik on the transformed scale (the inappropriate
# scale, Ripley & Venables would argue), does agree.
# All coefs are of course identical.
fit2 <- survreg(Surv(log(time), status) ~ I(1000/(273.2+temp)), imotor,
                subset=(temp>150), dist='gaussian')
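
# A quick check of the two claims above (our addition, not in the SAS
# example): the coefficients agree, and the two log-likelihoods differ by
# the Jacobian of the log transform, i.e. sum(log(time)) over the uncensored
# subjects. This assumes the usual 0/1 status coding in imotor.
all.equal(coef(fit1), coef(fit2))
with(subset(imotor, temp > 150),
     all.equal(fit2$loglik[2] - fit1$loglik[2], sum(log(time[status == 1]))))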

# Give the quantile estimates, which reproduce the lower half of
# "Output 48.1.5" in the SAS 9.2 manual
pp1 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
               type='quantile', se=TRUE)
pp2 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
               type='uquantile', se=TRUE)
pp1

temp130 <- matrix(0, nrow=3, ncol=6)
temp130[,1] <- pp1$fit[1,]
temp130[,2] <- pp1$se.fit[1,]
temp130[,3] <- pp2$fit[1,]
temp130[,4] <- pp2$se.fit[1,]
# 90% interval on the log scale, back-transformed (1.64 = rounded qnorm(.95))
temp130[,5] <- exp(pp2$fit[1,] - 1.64*pp2$se.fit[1,])
temp130[,6] <- exp(pp2$fit[1,] + 1.64*pp2$se.fit[1,])
dimnames(temp130) <- list(c("p=.1", "p=.5", "p=.9"),
                          c("Time", "se(time)", "log(time)", "se[log(time)]",
                            "lower 90", "upper 90"))
print(temp130)
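
# Sanity check (our addition): the two prediction types should be linked by
# exp() and the delta method, i.e. time = exp(log time) and
# se(time) = se(log time) * time.
all.equal(pp1$fit, exp(pp2$fit))
all.equal(pp1$se.fit, pp2$se.fit * exp(pp2$fit))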

# A set of examples, copied from the manual pages of SAS procedure
# "reliability", which is part of their QC product.
#

color <- c("black", "red", "green", "blue", "magenta", "red4",
           "orange", "DarkGreen", "cyan2", "DarkViolet")
palette(color)
pdf(file='reliability.pdf')

#
# Insulating fluids example
#

# Adding a -1 to the fit just causes each group to have its own
# intercept, rather than a global intercept + contrasts. The strata
# statement allows each group to have a separate scale.
ffit <- survreg(Surv(time) ~ factor(voltage) + strata(voltage) -1, ifluid)
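
# Spot check of the claim above (our addition, not in the SAS example): the
# likelihood separates over groups, so the joint fit should reproduce a
# separate single-group fit, here the lowest voltage (assuming voltage is
# numeric in ifluid), up to convergence error.
lowv <- min(ifluid$voltage)
one <- survreg(Surv(time) ~ 1, ifluid, subset=(voltage == lowv))
all.equal(unname(coef(ffit)[1]), unname(coef(one)))
all.equal(unname(ffit$scale[1]), unname(one$scale))
rm(lowv, one)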

# Get predicted quantiles at each of the voltages.
# By default predict() would give a row of results for each observation;
# I only want the unique set of x's, i.e., only 4 cases.
uvolt <- sort(unique(ifluid$voltage))   # the unique levels
plist <- c(1, 2, 5, 1:9 *10, 95, 99)/100
pred <- predict(ffit, type='quantile', p=plist,
                newdata=data.frame(voltage=uvolt))
# Complementary log-log transform: Weibull quantiles fall on a straight
# line when plotted as tfun(p) versus log(time).
tfun <- function(x) log(-log(1-x))

matplot(t(pred), tfun(plist), type='l', log='x', lty=1,
        col=1:4, yaxt='n',
        xlab="Predicted", ylab="Quantile")
axis(2, tfun(plist), format(100*plist), adj=1)
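
# Why tfun linearizes a Weibull fit: log(T) = mu + sigma*W with W standard
# minimum extreme value, so log(q_p) = mu + sigma*log(-log(1-p)). A check
# for the first voltage group (our addition, not in the SAS example):
all.equal(as.vector((log(pred[1,]) - coef(ffit)[1]) / ffit$scale[1]),
          tfun(plist))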

kfit <- survfit(Surv(time) ~ voltage, ifluid, type='fleming')  # Fleming-Harrington fit
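# With type='fleming' the survival curve is exp(-(Nelson-Aalen)); a quick
# check, assuming the fit carries a cumhaz component (recent versions of
# the survival package):
all.equal(kfit$surv, exp(-kfit$cumhaz))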

for (i in 1:4) {
    temp <- kfit[i]
    points(temp$time, tfun(1-temp$surv), col=i, pch=i)
}

# Now a table
temp <- array(0, dim=c(4,4,4))   # 4 groups by 4 parameters by 4 statistics
temp[,1,1] <- ffit$coef          # "EV Location" in SAS manual
temp[,2,1] <- ffit$scale         # "EV scale"
temp[,3,1] <- exp(ffit$coef)     # "Weibull Scale"
temp[,4,1] <- 1/ffit$scale       # "Weibull Shape"

# standard errors; the last three come from the delta method
temp[,1,2] <- sqrt(diag(ffit$var))[1:4]
temp[,2,2] <- sqrt(diag(ffit$var))[5:8] * ffit$scale
temp[,3,2] <- temp[,1,2] * temp[,3,1]
temp[,4,2] <- temp[,2,2] / (temp[,2,1])^2

temp[,1,3] <- temp[,1,1] - 1.96*temp[,1,2]  # lower conf limits
temp[,1,4] <- temp[,1,1] + 1.96*temp[,1,2]  # upper
# log(scale) is the natural parameter, in which the routine did its fitting
# and on which the std errors were computed
temp[,2,3] <- exp(log(ffit$scale) - 1.96*sqrt(diag(ffit$var))[5:8])
temp[,2,4] <- exp(log(ffit$scale) + 1.96*sqrt(diag(ffit$var))[5:8])

temp[,3,3:4] <- exp(temp[,1,3:4])
# shape = 1/scale is decreasing, so the CI endpoints swap (hence the 4:3)
temp[,4,3:4] <- 1/temp[,2,4:3]

dimnames(temp) <- list(uvolt, c("EV Location", "EV Scale", "Weibull scale",
                                "Weibull shape"),
                       c("Estimate", "SE", "lower 95% CI", "upper 95% CI"))
print(aperm(temp, c(2,3,1)), digits=5)

rm(temp, uvolt, plist, pred, ffit, kfit)


#####################################################################
# Turbine cracks data
crack2 <- with(cracks, data.frame(day1=c(NA, days), day2=c(days, NA),
                                  n=c(fail, 167-sum(fail))))
cfit <- survreg(Surv(day1, day2, type='interval2') ~ 1,
                dist='weibull', data=crack2, weights=n)
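
# In the interval2 coding an NA first time means left censored and an NA
# second time means right censored. A check of the censoring codes that
# Surv() assigned (0=right, 1=exact, 2=left, 3=interval) -- our addition:
table(Surv(crack2$day1, crack2$day2, type='interval2')[,3])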

summary(cfit)
# Their output also has Weibull scale = exp(cfit$coef), shape = 1/(cfit$scale)

# Draw the SAS plot.
# The "type=fleming" argument reflects that they estimate hazards rather than
# survival, and forces a Nelson-Aalen hazard estimate.
#
plist <- c(1, 2, 5, 1:8 *10)/100
plot(qsurvreg(plist, cfit$coef, cfit$scale), tfun(plist), log='x',
     yaxt='n', type='l',
     xlab="Weibull Plot for Time", ylab="Percent")
axis(2, tfun(plist), format(100*plist), adj=1)
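
# qsurvreg(p, linear.predictor, scale) gives fitted quantiles directly; for
# this intercept-only model it should match predict(type='quantile') for any
# single observation. A quick check (our addition):
all.equal(qsurvreg(plist, cfit$coef, cfit$scale),
          as.vector(predict(cfit, type='quantile', p=plist)[1,]))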

kfit <- survfit(Surv(day1, day2, type='interval2') ~ 1, data=crack2,
                weights=n, type='fleming')
# Only plot points where n.event > 0.
# Why? I'm trying to match SAS; personally, I think all should be plotted.
who <- (kfit$n.event > 0)
points(kfit$time[who], tfun(1-kfit$surv[who]), pch='+')
points(kfit$time[who], tfun(1-kfit$upper[who]), pch='-')
points(kfit$time[who], tfun(1-kfit$lower[who]), pch='-')

text(rep(3,6), seq(.5, -1.0, length=6),
     c("Scale", "Shape", "Right Censored", "Left Censored",
       "Interval Censored", "Fit"), adj=0)
text(rep(9,6), seq(.5, -1.0, length=6),
     c(format(round(exp(cfit$coef), 2)),
       format(round(1/cfit$scale, 2)),
       format(tapply(crack2$n, cfit$y[,3], sum)), "ML"), adj=1)

# Now a portion of their percentiles table.
# I don't get the same SE as SAS; I haven't checked out why. The
# estimates and se for the underlying Weibull model are the same.
temp <- predict(cfit, type='quantile', p=plist, se=TRUE)
tempse <- temp$se.fit[1,]
mat <- cbind(temp$fit[1,], tempse,
             temp$fit[1,] - 1.96*tempse, temp$fit[1,] + 1.96*tempse)
dimnames(mat) <- list(plist*100, c("Estimate", "SE", "Lower .95", "Upper .95"))
print(mat)

#
# The cracks data has a particularly easy estimate, so use
# it to double-check the code.
time <- c(crack2$day2[1], (crack2$day1 + crack2$day2)[2:8]/2,
          crack2$day1[9])
cdf <- cumsum(crack2$n)/sum(crack2$n)
all.equal(kfit$time, time)
all.equal(kfit$surv, 1-cdf[c(1:8,8)])
rm(time, cdf, kfit)

#######################################################
#
# Replacement of valve seats in diesel engines
# The input data has id, time, and an indicator of whether there was an
# event at that time: 0=no, 1=yes. No one has an event at their last time.
# The data has two engines with dual failures: 328 loses 2 valves at
# time 653, and number 402 loses 2 at time 139. For each, fudge the first
# time to be .1 days earlier.
#
ties <- which(diff(valveSeat$time)==0 & diff(valveSeat$id)==0)
temp <- valveSeat$time
temp[ties] <- temp[ties] - .1
n <- length(temp)
first <- !duplicated(valveSeat$id)
vtemp <- with(valveSeat, data.frame(id=id,
                                    time1= ifelse(first, 0, c(0, temp[-n])),
                                    time2= temp, status=status))
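
# A consistency check on the constructed intervals (our addition): the
# (time1, time2] intervals must be increasing, and each engine starts at 0.
stopifnot(all(vtemp$time2 > vtemp$time1), all(vtemp$time1[first] == 0))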

kfit <- survfit(Surv(time1, time2, status) ~ 1, vtemp, id=id)
plot(kfit, fun='cumhaz', ylab="Sample Mean Cumulative Failures", xlab='Time')
title("Valve replacement data")

# The summary.survfit function doesn't have an option for printing out
# cumulative hazards instead of survival (something to add someday),
# so I just reprise the central code of print.summary.survfit.
xx <- summary(kfit)
temp <- cbind(xx$time, xx$n.risk, xx$n.event, xx$cumhaz,
              xx$std.chaz, -log(xx$upper), -log(xx$lower))
dimnames(temp) <- list(rep("", nrow(temp)),
                       c("time", "n.risk", "n.event", "Cum haz", "std.err",
                         "lower 95%", "upper 95%"))
print(temp, digits=2)

# Note that I have the same estimates but different SEs than SAS: we are
# using different estimators. It's a statistical argument as to which is
# better (one could defend both sides): SAS uses the more standard estimate
# found in the reliability literature, mine comes from the statistics
# literature.
rm(temp, kfit, xx)

######################################################
# Turbine data, lognormal fit
turbine <- read.table('data.turbine',
                      col.names=c("time1", "time2", "n"))
tfit <- survreg(Surv(time1, time2, type='interval2') ~ 1, turbine,
                dist='lognormal', weights=n, subset=(n>0))
summary(tfit)

# Now, do their plot, but put bootstrap confidence bands on it!
# First, make a simple data set without weights.
tdata <- turbine[rep(1:nrow(turbine), turbine$n),]

qstat <- function(data) {
    temp <- survreg(Surv(time1, time2, type='interval2') ~ 1, data=data,
                    dist='lognormal')
    qsurvreg(plist, temp$coef, temp$scale, dist='lognormal')
}

set.seed(1953)   # a good year :-)
# The outer braces let the if/else parse as a single unit when the file is
# sourced: use the S bootstrap library if it is attached, otherwise a plain
# percentile bootstrap with matching 95% limits.
{if (exists('bootstrap')) {
    bfit <- bootstrap(tdata, qstat, B=1000)
    bci <- limits.bca(bfit, probs=c(.025, .975))
    }
 else {
    values <- matrix(0, nrow=1000, ncol=length(plist))
    n <- nrow(tdata)
    for (i in 1:1000) {
        subset <- sample(1:n, n, replace=TRUE)
        values[i,] <- qstat(tdata[subset,])
    }
    bci <- t(apply(values, 2, quantile, c(.025, .975)))
    }
}
xmat <- cbind(qsurvreg(plist, tfit$coef, tfit$scale, dist='lognormal'),
              bci)

matplot(xmat, qnorm(plist),
        type='l', lty=c(1,2,2), col=c(1,1,1),
        log='x', yaxt='n', ylab='Percent',
        xlab='Time of Cracking (Hours x 100)')
axis(2, qnorm(plist), format(100*plist), adj=1)
title("Turbine Data")

kfit <- survfit(Surv(time1, time2, type='interval2') ~ 1, data=tdata)
points(kfit$time, qnorm(1-kfit$surv), pch='+')

dev.off()   # close the plot file