#
# Test weights in a regression problem
#
library(rpart)
set.seed(10)

# Working data set: the state.x77 columns under shorter names, plus
# state.region stored as a factor.
mystate <- data.frame(state.x77, region = factor(state.region))
names(mystate) <- c("population", "income", "illiteracy", "life",
                    "murder", "hs.grad", "frost", "area", "region")

# Explicit cross-validation group for every row, so that both fits below are
# cross-validated on exactly the same partition of the data.
xgrp <- rep(1:10, length.out = nrow(mystate))

# Reference fit: no case weights.
fit4 <- rpart(income ~ population + region + illiteracy + life + murder +
                hs.grad + frost, mystate,
              control = rpart.control(minsplit = 10, xval = xgrp))

# Same model and control settings, but every observation carries weight 3.
wts <- rep(3, nrow(mystate))
fit4b <- rpart(income ~ population + region + illiteracy + life + murder +
                 hs.grad + frost, mystate,
               control = rpart.control(minsplit = 10, xval = xgrp),
               weights = wts)

# Undo the effect of the constant weight so the two fits can be compared
# directly: node weights and deviances are divided by 3, and column 5 of the
# cp table is multiplied by sqrt(3).
# NOTE(review): column 5 is presumably "xstd" (cross-validation std. error);
# confirm against the rpart documentation before switching to a name.
fit4b$frame$wt <- fit4b$frame$wt / 3
fit4b$frame$dev <- fit4b$frame$dev / 3
fit4b$cptable[, 5] <- fit4b$cptable[, 5] * sqrt(3)

# Components that are compared between the unweighted and weighted fits.
temp <- c("frame", "where", "splits", "csplit", "cptable")
all.equal(fit4[temp], fit4b[temp])

# Next is a very simple case, but worth keeping
dummy <- data.frame(y = 1:10, x1 = c(10:4, 1:3),
                    x2 = c(1, 3, 5, 7, 9, 2, 4, 6, 8, 0))

# Unweighted fit, then the same fit with every case weight set to 2.
# Cross-validation is turned off (xval = 0) in both.
xx1 <- rpart(y ~ x1 + x2, dummy, minsplit = 4, xval = 0)
xx2 <- rpart(y ~ x1 + x2, dummy, weights = rep(2, nrow(dummy)),
             minsplit = 4, xval = 0)

# Node deviances are checked against fixed values; with all weights doubled
# the expected deviances are exactly twice those of the unweighted fit.
all.equal(xx1$frame$dev, c(82.5, 10, 2, 0.5, 10, 0.5, 2))
all.equal(xx2$frame$dev, c(82.5, 10, 2, 0.5, 10, 0.5, 2) * 2)

# Now for a set of non-equal weights
#  We need to set maxcompete=3 because there just happens to be, in one
#  of the lower nodes, an exact tie between variables "life" and "murder".
#  Round off error causes fit5 to choose one and fit5b the other.
# Later -- cut it back to maxdepth=3 for the same reason (a tie).
#
nn <- nrow(mystate)
wts <- rep(1:5, length.out = nn)
temp <- rep(seq_len(nn), wts)  # row replicates: row i appears wts[i] times
xgrp <- rep(1:10, length.out = nn)
# Cross-validation groups expanded the same way as the rows, so every
# replicate of a row stays in that row's group.
xgrp2 <- rep(xgrp, wts)

# Direct: replicate rows in the data set, and use unweighted
tempc <- rpart.control(minsplit = 2, xval = xgrp2, maxsurrogate = 0,
                       maxcompete = 3, maxdepth = 3)
fit5 <- rpart(income ~ population + region + illiteracy + life + murder +
                hs.grad + frost, data = mystate[temp, ], control = tempc)

# Weighted: original rows with case weights; identical control settings
# except that the un-expanded grouping vector is used for xval.
tempc <- rpart.control(minsplit = 2, xval = xgrp, maxsurrogate = 0,
                       maxcompete = 3, maxdepth = 3)
fit5b <- rpart(income ~ population + region + illiteracy + life + murder +
                 hs.grad + frost, data = mystate, control = tempc,
               weights = wts)

all.equal(fit5$frame[-2], fit5b$frame[-2])  # the "n" component won't match
all.equal(fit5$cptable, fit5b$cptable)
# The first column of splits is left out of the comparison.
# NOTE(review): presumably the observation count, which differs for the same
# reason as "n" above -- confirm.
all.equal(fit5$splits[, -1], fit5b$splits[, -1])
all.equal(fit5$csplit, fit5b$csplit)