2025-01-12 04:36:52 +08:00

107 lines
3.2 KiB
R

#
# This is a set of tests, with printout, that tries
# to exercise all of the splitting rules
# It's less formal than the others in this directory, but covers
# more.
# For all of the cross-validations I set xgroups explicitly, so as
# to avoid any changes in random number allocation of the groups.
library(rpart)
require(survival)
options(digits=4) #avoid trivial rounding changes across R versions
set.seed(10)
#
#
# Survival, using the stagec data
#
# Time to progression in years
# status 1=progressed, 0= censored
# age
# early endocrine therapy 1=no 2=yes
# % of cells in g2 phase, from flow cytometry
# tumor grade (Farrow) 1,2,3,4
# Gleason score (competing grading system)
# ploidy
xgroup <- rep(1:10, length=nrow(stagec))
fit1 <- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
stagec, method="poisson",
control=rpart.control(usesurrogate=0, cp=0, xval=xgroup))
fit1
summary(fit1)
fit2 <- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
stagec, xval=xgroup)
fit2
summary(fit2)
#
# Continuous y variable
# Use deterministic xgroups: the first group is the 1st, 11th, 21st, etc
# smallest obs, the second is 2, 12, 22, ...
mystate <- data.frame(state.x77, region=factor(state.region))
names(mystate) <- c("population","income" , "illiteracy","life" ,
"murder", "hs.grad", "frost", "area", "region")
xvals <- 1:nrow(mystate)
xvals[order(mystate$income)] <- rep(1:10, length=nrow(mystate))
fit4 <- rpart(income ~ population + region + illiteracy +life + murder +
hs.grad + frost , mystate,
control=rpart.control(minsplit=10, xval=xvals))
summary(fit4)
#
# Check out xpred.rpart
#
meany <- mean(mystate$income)
xpr <- xpred.rpart(fit4, xval=xvals)
xpr2 <- (xpr - mystate$income)^2
risk0 <- mean((mystate$income - meany)^2)
xpmean <- as.vector(apply(xpr2, 2, mean)) #kill the names
all.equal(xpmean/risk0, as.vector(fit4$cptable[,'xerror']))
xpstd <- as.vector(apply((sweep(xpr2, 2, xpmean))^2, 2, sum))
xpstd <- sqrt(xpstd)/(50*risk0)
all.equal(xpstd, as.vector(fit4$cptable[,'xstd']))
#
# recreate subset #3 of the xval
#
tfit4 <- rpart(income ~ population + region + illiteracy +life + murder +
hs.grad + frost , mystate, subset=(xvals!=3),
control=rpart.control(minsplit=10, xval=0))
tpred <- predict(tfit4, mystate[xvals==3,])
all.equal(tpred, xpr[xvals==3,ncol(xpr)])
# How much does this differ from the "real" formula, more complex,
# found on page 309 of Breiman et al. ?
#xtemp <- (xpr2/outer(rep(1,50),xpmean)) - ((mystate$income - meany)^2)/risk0
#real.se<- xpmean* sqrt(apply(xtemp^2,2,sum))/(risk0*50)
# Simple yes/no classification model
fit5 <- rpart(factor(pgstat) ~ age + eet + g2+grade+gleason +ploidy,
stagec, xval=xgroup)
fit5
fit6 <- rpart(factor(pgstat) ~ age + eet + g2+grade+gleason +ploidy,
stagec, parm=list(prior=c(.5,.5)), xval=xgroup)
summary(fit6)
#
# Fit a classification model to the car data.
# Now, since Reliability is an ordered category, this model doesn't
# make a lot of statistical sense, but it does test out some
# areas of the code that nothing else does
#
xcar <- rep(1:8, length=nrow(cu.summary))
carfit <- rpart(Reliability ~ Price + Country + Mileage + Type,
method='class', data=cu.summary, xval=xcar)
summary(carfit)