157 lines
5.9 KiB
R
157 lines
5.9 KiB
R
options(na.action=na.exclude) # preserve missings
|
|
options(contrasts=c('contr.treatment', 'contr.poly')) #ensure constrast type
|
|
library(survival)
|
|
|
|
#
|
|
# The test data set from Turnbull, JASA 1974, 169-73.
|
|
#
|
|
# status 0=right censored
|
|
# 1=exact
|
|
# 2=left censored
|
|
#
|
|
aeq <- function(x,y, ...) all.equal(as.vector(x), as.vector(y), ...)
|
|
|
|
|
|
turnbull <- data.frame( time =c( 1,1,1, 2,2,2, 3,3,3, 4,4,4),
|
|
status=c( 1,0,2, 1,0,2, 1,0,2, 1,0,2),
|
|
n =c(12,3,2, 6,2,4, 2,0,2, 3,3,5))
|
|
#
|
|
# Compute the K-M for the Turnbull data
|
|
# via a slow EM calculation
|
|
#
|
|
|
|
emsurv <- function(time, status, wt, verbose=T) {
|
|
left.cen <- (status==2)
|
|
if (!any(left.cen)) stop("No left censored data!")
|
|
if (!any(status==1))stop("Must have some exact death times")
|
|
|
|
tempy <- Surv(time[!left.cen], status[!left.cen])
|
|
ww <- wt[!left.cen]
|
|
tempx <- factor(rep(1, sum(!left.cen)))
|
|
tfit <- survfit(tempy~tempx, weights=ww)
|
|
if (verbose)
|
|
cat("Iteration 0, survival=", format(round(tfit$surv[tfit$n.event>0],3)),
|
|
"\n")
|
|
|
|
stimes <- tfit$time[tfit$n.event>0]
|
|
ltime <- time[left.cen]
|
|
lwt <- wt[left.cen]
|
|
tempx <- factor(rep(1, length(stimes) + sum(!left.cen)))
|
|
tempy <- Surv(c(time[!left.cen], stimes),
|
|
c(status[!left.cen], rep(1, length(stimes))))
|
|
for (iter in 1:4) {
|
|
wt2 <- stimes*0
|
|
ssurv <- tfit$surv[tfit$n.event>0]
|
|
sjump <- diff(c(1, ssurv))
|
|
for (j in 1:(length(ltime))) {
|
|
k <- sum(ltime[j]>=stimes) #index of the death time
|
|
if (k==0)
|
|
stop("Left censored observation before the first death")
|
|
wt2[1:k] <- wt2[1:k] + lwt[j]*sjump[1:k] /(ssurv[k]-1)
|
|
}
|
|
tfit <- survfit(tempy~tempx, weights=c(ww, wt2))
|
|
if (verbose) {
|
|
cat("Iteration", iter, "survival=",
|
|
format(round(tfit$surv[tfit$n.event>0],3)), "\n")
|
|
cat(" weights=", format(round(wt2,3)), "\n")
|
|
}
|
|
}
|
|
survfit(tempy ~ tempx, weights=c(ww, wt2), robust=FALSE)
|
|
}
|
|
|
|
temp <-emsurv(turnbull$time, turnbull$status, turnbull$n)
|
|
print(summary(temp))
|
|
# First check, use the data from Turnbull, JASA 1974, 169-173.
|
|
|
|
tdata <- data.frame(time =c(1,1,1,2,2,2,3,3,3,4,4,4),
|
|
status=rep(c(1,0,2),4),
|
|
n =c(12,3,2,6,2,4,2,0,2,3,3,5))
|
|
|
|
tfit <- survfit(Surv(time, time, status, type='interval') ~1, tdata, weights=n)
|
|
all.equal(round(tfit$surv,3), c(.538, .295, .210, .095))
|
|
|
|
|
|
# Second check, compare to a reversed survival curve
|
|
# This is not as simple a test as one might think, because left and right
|
|
# censored observations are not treated symmetrically by the routine:
|
|
# time <= y for left and time> y for right (this is to make the routine
|
|
# correct for the common situation of panel data).
|
|
# To get equivalence, make the left censoreds happen just a little bit
|
|
# earlier. The left-continuous/right-continuous shift is also a bother.
|
|
#
|
|
test1 <- data.frame(time= c(9, 3,1,1,6,6,8),
|
|
status=c(1,NA,1,0,1,1,0),
|
|
x= c(0, 2,1,1,1,0,0))
|
|
fit1 <- survfit(Surv(time, status) ~1, test1)
|
|
temp <- ifelse(test1$status==0, 4.99,5) - test1$time
|
|
fit2 <- survfit(Surv(temp, status, type='left') ~1, test1)
|
|
|
|
all.equal(round(fit1$surv[1:2],5), round(1-fit2$surv[3:2],5))
|
|
|
|
rm(tdata, tfit, fit1, temp, fit2)
|
|
#
|
|
# Create a data set similar to the one provided by Al Zinsmeister
|
|
# It is a hard test case for survfit.turnbull
|
|
#
|
|
time1 <- c(rep(0,100), rep(1,200), 100, 200, 210, 220,
|
|
rep(365,100), rep(366,5), 731:741)
|
|
|
|
time2 <- c((1:100)*3, 10+1:100, rep(365:366, c(60,40)), NA, 500, NA, 450,
|
|
rep(730,90), rep(NA,10), c(528,571,691,730,731),
|
|
NA, 1095:1099, NA, 1400, 1200, 772, 1461)
|
|
|
|
zfit <- survfit(Surv(time1, time2, type='interval2') ~1)
|
|
|
|
#
|
|
# There are 100 intervals of the form (0,x) where x is from 3 to 300,
|
|
# and 200 more of the form (1,x) where x is from 11 to 366. These
|
|
# lead to a mass point in the interval (1,3), which is placed at 2.
|
|
# The starting estimate has far too little mass placed here, and it takes
|
|
# the EM a long time to realize that most of the weight for the first 300
|
|
# subjects goes here. With acceleration, it takes 16 iterations, without
|
|
# it takes >40. (On Al's orginal data, without accel still wasn't there after
|
|
# 165 iters!)
|
|
#
|
|
# The next 4 obs give rise to potential jumps at 100.5, 200.5, 211.5, and
|
|
# 221. However, the final estimate has no mass at all on any of these.
|
|
# Assume mass of a,b, and c at 2, 100.5 and 365.5, and consider the
|
|
# contributions:
|
|
# 123 obs that overlap a only
|
|
# 137 obs that overlap a and b
|
|
# 40 obs that overlap a, b, c
|
|
# 1 obs that overlap b, c
|
|
# 108 obs that overlap c (200, 210,200, 365, and 366 starting points)
|
|
# For some trial values of a,b,c, compare the loglik to that of (a+b),0,c
|
|
# First one: a^123 (a+b)^137 (a+b+c)^40 (b+c) c^108
|
|
# Second: (a+b)^123 (a+b)^137 (a+b+c)^40 c c^108
|
|
# Likelhood improves if (1 + b/a)^123 > 1+ b/c, which is true for almost
|
|
# all a and c. In particular, at the solution a and c are approx .7 and
|
|
# .18, respectively.
|
|
#
|
|
# The program can't see this coming, of course, and so iterates towards a
|
|
# KM with epsilon sized jumps at 100.5, 200.5, and 211.5. Whether these
|
|
# intervals should be removed during iteration, as detected, is an open
|
|
# question for me.
|
|
#
|
|
#
|
|
# True solution: mass points at 2, 365.5, 408, and 756.5, of sizes a, b, c, d
|
|
# Likelihood: a^260 (a+b)^40 (b+c)^92 (b+c+d)^12 c^5 d^11
|
|
# Solution: a=0.6958, b=0.1674, c=0.1079, d=0.0289
|
|
|
|
tfun <- function(x) {
|
|
if (length(x) ==3) x <- c(x, .03)
|
|
x <- x/sum(x) #make probabilities sum to 1
|
|
loglik <- 260*log(x[1]) + 40*log(x[1]+x[2]) + 92*log(x[2] + x[3]) +
|
|
12*log(x[2]+x[3]+x[4]) + 5*log(x[3]) + 11*log(x[4])
|
|
-loglik #find the max, not the min
|
|
}
|
|
|
|
nfit <- nlminb(start=c(.7,.15, .1), tfun, lower=0, upper=1)
|
|
nparm <- c(nfit$par, .03)
|
|
nparm <- nparm / sum(nparm)
|
|
zparm <- -diff(c(1, zfit$surv[match(c(2, 365.5, 408, 756.5), zfit$time)]))
|
|
aeq(round(tfun(nparm),4), round(tfun(zparm),4))
|
|
# .0001 is the tolerance in survfit.turnbull
|
|
|
|
rm(tfun, nfit, nparm, zparm, time1, time2, zfit)
|