2025-01-12 04:36:52 +08:00

436 lines
17 KiB
Plaintext

R Under development (unstable) (2024-04-17 r86441) -- "Unsuffered Consequences"
Copyright (C) 2024 The R Foundation for Statistical Computing
Platform: aarch64-unknown-linux-gnu
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> options(na.action=na.exclude) # preserve missings
> options(contrasts=c('contr.treatment', 'contr.poly')) #ensure constrast type
> library(survival)
>
> #
> # Reproduce example 1 in the SAS lifereg documentation
> #
>
> # this fit doesn't give the same log-lik that they claim
> fit1 <- survreg(Surv(time, status) ~ I(1000/(273.2+temp)), imotor,
+ subset=(temp>150), dist='lognormal')
> summary(fit1)
Call:
survreg(formula = Surv(time, status) ~ I(1000/(273.2 + temp)),
data = imotor, subset = (temp > 150), dist = "lognormal")
Value Std. Error z p
(Intercept) -10.471 2.772 -3.78 0.00016
I(1000/(273.2 + temp)) 8.322 1.284 6.48 9.1e-11
Log(scale) -0.504 0.183 -2.75 0.00596
Scale= 0.604
Log Normal distribution
Loglik(model)= -145.9 Loglik(intercept only)= -155
Chisq= 18.3 on 1 degrees of freedom, p= 1.9e-05
Number of Newton-Raphson Iterations: 6
n= 30
>
> # This one, with the loglik on the transformed scale (the inappropriate
> # scale, Ripley & Venables would argue) does agree.
> # All coefs are of course identical.
> fit2 <- survreg(Surv(log(time), status) ~ I(1000/(273.2+temp)), imotor,
+ subset=(temp>150), dist='gaussian')
>
>
> # Give the quantile estimates, which is the lower half of "output 48.1.5"
> # in the SAS 9.2 manual
>
> pp1 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
+ type='quantile', se=T)
> pp2 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
+ type='uquantile', se=T)
> pp1
$fit
[,1] [,2] [,3]
[1,] 12033.185 26095.677 56592.20
[2,] 4536.877 9838.864 21336.98
$se.fit
[,1] [,2] [,3]
[1,] 5482.338 11359.450 26036.917
[2,] 1443.072 2901.155 7172.343
>
> temp130 <- matrix(0, nrow=3, ncol=6)
> temp130[,1] <- pp1$fit[1,]
> temp130[,2] <- pp1$se.fit[1,]
> temp130[,3] <- pp2$fit[1,]
> temp130[,4] <- pp2$se.fit[1,]
> temp130[,5] <- exp(pp2$fit[1,] - 1.64*pp2$se.fit[1,])
> temp130[,6] <- exp(pp2$fit[1,] + 1.64*pp2$se.fit[1,])
> dimnames(temp130) <- list(c("p=.1", "p=.2", "p=.3"),
+ c("Time", "se(time)", "log(time)", "se[log(time)]",
+ "lower 90", "upper 90"))
> print(temp130)
Time se(time) log(time) se[log(time)] lower 90 upper 90
p=.1 12033.18 5482.338 9.395424 0.4556015 5700.089 25402.68
p=.2 26095.68 11359.450 10.169525 0.4353001 12779.950 53285.37
p=.3 56592.20 26036.917 10.943626 0.4600796 26611.422 120349.71
>
> # A set of examples, copied from the manual pages of SAS procedure
> # "reliability", which is part of their QC product.
> #
>
> color <- c("black", "red", "green", "blue", "magenta", "red4",
+ "orange", "DarkGreen", "cyan2", "DarkViolet")
> palette(color)
> pdf(file='reliability.pdf')
>
> #
> # Insulating fluids example
> #
>
> # Adding a -1 to the fit just causes the each group to have it's own
> # intercept, rather than a global intercept + constrasts. The strata
> # statement allows each to have a separate scale
> ffit <- survreg(Surv(time) ~ factor(voltage) + strata(voltage) -1, ifluid)
>
> # Get predicted quantiles at each of the voltages
> # By default predict() would give a line of results for each observation,
> # I only want the unique set of x's, i.e., only 4 cases
> uvolt <- sort(unique(ifluid$voltage)) #the unique levels
> plist <- c(1, 2, 5, 1:9 *10, 95, 99)/100
> pred <- predict(ffit, type='quantile', p=plist,
+ newdata=data.frame(voltage=uvolt))
> tfun <- function(x) log(-log(1-x))
>
> matplot(t(pred), tfun(plist), type='l', log='x', lty=1,
+ col=1:4, yaxt='n',
+ xlab="Predicted", ylab="Quantile")
> axis(2, tfun(plist), format(100*plist), adj=1)
>
> kfit <- survfit(Surv(time) ~ voltage, ifluid, type='fleming') #KM fit
> for (i in 1:4) {
+ temp <- kfit[i]
+ points(temp$time, tfun(1-temp$surv), col=i, pch=i)
+ }
>
> # Now a table
> temp <- array(0, dim=c(4,4,4)) #4 groups by 4 parameters by 4 stats
> temp[,1,1] <- ffit$coef # "EV Location" in SAS manual
> temp[,2,1] <- ffit$scale # "EV scale"
> temp[,3,1] <- exp(ffit$coef) # "Weibull Scale"
> temp[,4,1] <- 1/ffit$scale # "Weibull Shape"
>
> temp[,1,2] <- sqrt(diag(ffit$var))[1:4] #standard error
> temp[,2,2] <- sqrt(diag(ffit$var))[5:8] * ffit$scale
> temp[,3,2] <- temp[,1,2] * temp[,3,1]
> temp[,4,2] <- temp[,2,2] / (temp[,2,1])^2
>
> temp[,1,3] <- temp[,1,1] - 1.96*temp[,1,2] #lower conf limits
> temp[,1,4] <- temp[,1,1] + 1.96*temp[,1,2] # upper
> # log(scale) is the natural parameter, in which the routine did its fitting
> # and on which the std errors were computed
> temp[,2, 3] <- exp(log(ffit$scale) - 1.96*sqrt(diag(ffit$var))[5:8])
> temp[,2, 4] <- exp(log(ffit$scale) + 1.96*sqrt(diag(ffit$var))[5:8])
>
> temp[,3, 3:4] <- exp(temp[,1,3:4])
> temp[,4, 3:4] <- 1/temp[,2,4:3]
>
> dimnames(temp) <- list(uvolt, c("EV Location", "EV Scale", "Weibull scale",
+ "Weibull shape"),
+ c("Estimate", "SE", "lower 95% CI", "uppper 95% CI"))
> print(aperm(temp, c(2,3,1)), digits=5)
, , 26
Estimate SE lower 95% CI uppper 95% CI
EV Location 6.86249 1.10404 4.69857 9.0264
EV Scale 1.83423 0.96114 0.65677 5.1227
Weibull scale 955.74665 1055.18620 109.78973 8320.0103
Weibull shape 0.54519 0.28568 0.19521 1.5226
, , 30
Estimate SE lower 95% CI uppper 95% CI
EV Location 4.35133 0.30151 3.76037 4.9423
EV Scale 0.94446 0.22544 0.59156 1.5079
Weibull scale 77.58159 23.39176 42.96420 140.0911
Weibull shape 1.05881 0.25274 0.66318 1.6904
, , 34
Estimate SE lower 95% CI uppper 95% CI
EV Location 2.50326 0.31476 1.88632 3.1202
EV Scale 1.29732 0.22895 0.91796 1.8334
Weibull scale 12.22222 3.84707 6.59509 22.6506
Weibull shape 0.77082 0.13603 0.54542 1.0894
, , 38
Estimate SE lower 95% CI uppper 95% CI
EV Location 0.00092629 0.27318 -0.53450 0.53635
EV Scale 0.73367610 0.20380 0.42565 1.26460
Weibull scale 1.00092672 0.27343 0.58596 1.70976
Weibull shape 1.36299929 0.37861 0.79077 2.34933
>
> rm(temp, uvolt, plist, pred, ffit, kfit)
>
> #####################################################################
> # Turbine cracks data
> crack2 <- with(cracks, data.frame(day1=c(NA, days), day2=c(days, NA),
+ n=c(fail, 167-sum(fail))))
> cfit <- survreg(Surv(day1, day2, type='interval2') ~1,
+ dist='weibull', data=crack2, weights=n)
>
> summary(cfit)
Call:
survreg(formula = Surv(day1, day2, type = "interval2") ~ 1, data = crack2,
weights = n, dist = "weibull")
Value Std. Error z p
(Intercept) 7.6880 0.0744 103.30 < 2e-16
Log(scale) -0.3953 0.0987 -4.01 6.2e-05
Scale= 0.674
Weibull distribution
Loglik(model)= -309.6 Loglik(intercept only)= -309.6
Number of Newton-Raphson Iterations: 5
n= 9
> #Their output also has Wiebull scale = exp(cfit$coef), shape = 1/(cfit$scale)
>
> # Draw the SAS plot
> # The "type=fleming" argument reflects that they estimate hazards rather than
> # survival, and forces a Nelson-Aalen hazard estimate
> #
> plist <- c(1, 2, 5, 1:8 *10)/100
> plot(qsurvreg(plist, cfit$coef, cfit$scale), tfun(plist), log='x',
+ yaxt='n', type='l',
+ xlab="Weibull Plot for Time", ylab="Percent")
> axis(2, tfun(plist), format(100*plist), adj=1)
>
> kfit <- survfit(Surv(day1, day2, type='interval2') ~1, data=crack2,
+ weight=n, type='fleming')
> # Only plot point where n.event > 0
> # Why? I'm trying to match them. Personally, all should be plotted.
> who <- (kfit$n.event > 0)
> points(kfit$time[who], tfun(1-kfit$surv[who]), pch='+')
> points(kfit$time[who], tfun(1-kfit$upper[who]), pch='-')
> points(kfit$time[who], tfun(1-kfit$lower[who]), pch='-')
>
> text(rep(3,6), seq(.5, -1.0, length=6),
+ c("Scale", "Shape", "Right Censored", "Left Censored",
+ "Interval Censored", "Fit"), adj=0)
> text(rep(9,6), seq(.5, -1.0, length=6),
+ c(format(round(exp(cfit$coef), 2)),
+ format(round(1/cfit$scale, 2)),
+ format(tapply(crack2$n, cfit$y[,3], sum)), "ML"), adj=1)
>
> # Now a portion of his percentiles table
> # I don't get the same SE as SAS, I haven't checked out why. The
> # estimates and se for the underlying Weibull model are the same.
> temp <- predict(cfit, type='quantile', p=plist, se=T)
> tempse <- sqrt(temp$se[1,])
> mat <- cbind(temp$fit[1,], tempse,
+ temp$fit[1,] -1.96*tempse, temp$fit[1,] + 1.96*tempse)
> dimnames(mat) <- list(plist*100, c("Estimate", "SE", "Lower .95", "Upper .95"))
> print(mat)
Estimate SE Lower .95 Upper .95
1 98.47183 5.321676 88.04134 108.9023
2 157.59360 6.186246 145.46856 169.7186
5 295.17088 7.377034 280.71189 309.6299
10 479.31737 8.227489 463.19149 495.4432
20 794.55351 8.951640 777.00829 812.0987
30 1089.70373 9.404613 1071.27068 1108.1368
40 1387.95516 9.983696 1368.38712 1407.5232
50 1704.71015 10.888487 1683.36872 1726.0516
60 2057.23906 12.217230 2033.29329 2081.1848
70 2472.58593 14.035515 2445.07632 2500.0955
80 3006.43572 16.510569 2974.07501 3038.7964
>
> #
> # The cracks data has a particularly easy estimate, so use
> # it to double check code
> time <- c(crack2$day2[1], (crack2$day1 + crack2$day2)[2:8]/2,
+ crack2$day1[9])
> cdf <- cumsum(crack2$n)/sum(crack2$n)
> all.equal(kfit$time, time)
[1] TRUE
> all.equal(kfit$surv, 1-cdf[c(1:8,8)])
[1] TRUE
> rm(time, cdf, kfit)
>
>
> #######################################################
> #
> # Replacement of valve seats in diesel engines
> # The input data has id, time, and an indicator of whether there was an
> # event at that time: 0=no, 1=yes. No one has an event at their last time.
> # The input data has two engines with dual failures: 328 loses 2 valves at
> # time 653, and number 402 loses 2 at time 139. For each, fudge the first
> # time to be .1 days earlier.
> #
> ties <- which(diff(valveSeat$time)==0 & diff(valveSeat$id)==0)
> temp <- valveSeat$time
> temp[ties] <- temp[ties] - .1
> n <- length(temp)
> first <- !duplicated(valveSeat$id)
> vtemp <- with(valveSeat, data.frame(id =id,
+ time1= ifelse(first, 0, c(0, temp[-n])),
+ time2= temp, status=status))
>
> kfit <- survfit(Surv(time1, time2, status) ~1, vtemp, id=id)
>
> plot(kfit, fun='cumhaz', ylab="Sample Mean Cumulative Failures", xlab='Time')
> title("Valve replacement data")
>
> # The summary.survfit function doesn't have an option for printing out
> # cumulative hazards instead of survival --- need to add that
> # so I just reprise the central code of print.summary.survfit
> xx <- summary(kfit)
> temp <- cbind(xx$time, xx$n.risk, xx$n.event, xx$cumhaz,
+ xx$std.chaz, -log(xx$upper), -log(xx$lower))
> dimnames(temp) <- list(rep("", nrow(temp)),
+ c("time", "n.risk", "n.event", "Cum haz", "std.err",
+ "lower 95%", "upper 95%"))
> print(temp, digits=2)
time n.risk n.event Cum haz std.err lower 95% upper 95%
61 41 1 0.024 0.024 0.0000 0.073
76 41 1 0.049 0.034 0.0000 0.117
84 41 1 0.073 0.041 0.0000 0.156
87 41 1 0.098 0.046 0.0057 0.192
92 41 1 0.122 0.051 0.0208 0.226
98 41 1 0.146 0.055 0.0373 0.259
120 41 1 0.171 0.059 0.0548 0.291
139 41 1 0.195 0.062 0.0732 0.322
139 41 1 0.220 0.073 0.0750 0.369
165 41 1 0.244 0.075 0.0954 0.398
166 41 1 0.268 0.077 0.1163 0.427
202 41 1 0.293 0.079 0.1376 0.455
206 41 1 0.317 0.088 0.1452 0.497
249 41 1 0.341 0.089 0.1675 0.524
254 41 1 0.366 0.090 0.1903 0.551
258 41 1 0.390 0.090 0.2133 0.577
265 41 1 0.415 0.091 0.2368 0.603
276 41 1 0.439 0.098 0.2479 0.641
298 41 1 0.463 0.110 0.2490 0.689
323 41 1 0.488 0.110 0.2734 0.714
326 41 1 0.512 0.110 0.2981 0.739
328 41 1 0.537 0.115 0.3124 0.774
344 41 1 0.561 0.115 0.3376 0.798
348 41 1 0.585 0.124 0.3430 0.842
349 41 1 0.610 0.124 0.3686 0.866
367 41 1 0.634 0.123 0.3945 0.890
377 41 1 0.659 0.132 0.4018 0.932
404 40 1 0.684 0.136 0.4189 0.965
408 40 1 0.709 0.140 0.4365 0.998
410 40 1 0.734 0.139 0.4631 1.022
449 40 1 0.759 0.143 0.4813 1.055
479 40 1 0.784 0.146 0.4998 1.087
497 40 1 0.809 0.149 0.5187 1.118
538 40 1 0.834 0.152 0.5380 1.150
539 40 1 0.859 0.155 0.5576 1.181
561 40 1 0.884 0.162 0.5696 1.220
563 40 1 0.909 0.164 0.5898 1.250
570 40 1 0.934 0.170 0.6029 1.287
573 40 1 0.959 0.169 0.6310 1.310
581 38 1 0.985 0.171 0.6532 1.341
586 34 1 1.014 0.174 0.6777 1.376
604 22 1 1.060 0.185 0.7009 1.446
621 17 1 1.119 0.208 0.7144 1.554
635 16 1 1.181 0.207 0.7795 1.618
640 16 1 1.244 0.226 0.8033 1.723
646 13 1 1.320 0.229 0.8775 1.809
653 9 1 1.432 0.252 0.9394 1.983
653 9 1 1.543 0.312 0.9201 2.238
>
> # Note that I have the same estimates but different SE's than SAS. We are using
> # a different estimator. It's a statistical argument as to which is
> # better (one could defend both sides): SAS the more standard estimate found
> # in the reliability literature, mine the estimate from statistics literature
> rm(temp, kfit, xx)
>
> ######################################################
> # Turbine data, lognormal fit
> turbine <- read.table('data.turbine',
+ col.names=c("time1", "time2", "n"))
>
> tfit <- survreg(Surv(time1, time2, type='interval2') ~1, turbine,
+ dist='lognormal', weights=n, subset=(n>0))
>
> summary(tfit)
Call:
survreg(formula = Surv(time1, time2, type = "interval2") ~ 1,
data = turbine, weights = n, subset = (n > 0), dist = "lognormal")
Value Std. Error z p
(Intercept) 3.6999 0.0708 52.23 <2e-16
Log(scale) -0.3287 0.1232 -2.67 0.0076
Scale= 0.72
Log Normal distribution
Loglik(model)= -190.7 Loglik(intercept only)= -190.7
Number of Newton-Raphson Iterations: 6
n= 21
>
> # Now, do his plot, but put bootstrap confidence bands on it!
> # First, make a simple data set without weights
> tdata <- turbine[rep(1:nrow(turbine), turbine$n),]
>
> qstat <- function(data) {
+ temp <- survreg(Surv(time1, time2, type='interval2') ~1, data=data,
+ dist='lognormal')
+ qsurvreg(plist, temp$coef, temp$scale, dist='lognormal')
+ }
>
> {if (exists('bootstrap')) {
+ set.seed(1953) # a good year :-)
+ bfit <- bootstrap(tdata, qstat, B=1000)
+ bci <- limits.bca(bfit, probs=c(.025, .975))
+ }
+ else {
+ values <- matrix(0, nrow=1000, ncol=length(plist))
+ n <- nrow(tdata)
+ for (i in 1:1000) {
+ subset <- sample(1:n, n, replace=T)
+ values[i,] <- qstat(tdata[subset,])
+ }
+ bci <- t(apply(values,2, quantile, c(.05, .95)))
+ }
+ }
> xmat <- cbind(qsurvreg(plist, tfit$coef, tfit$scale, dist='lognormal'),
+ bci)
>
>
> matplot(xmat, qnorm(plist),
+ type='l', lty=c(1,2,2), col=c(1,1,1),
+ log='x', yaxt='n', ylab='Percent',
+ xlab='Time of Cracking (Hours x 100)')
> axis(2, qnorm(plist), format(100*plist), adj=1)
> title("Turbine Data")
> kfit <- survfit(Surv(time1, time2, type='interval2') ~1, data=tdata)
> points(kfit$time, qnorm(1-kfit$surv), pch='+')
>
> dev.off() #close the plot file
null device
1
>
>
> proc.time()
user system elapsed
1.224 0.016 1.238