R Under development (unstable) (2019-04-05 r76323) -- "Unsuffered Consequences"
Copyright (C) 2019 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> #
> # Test weights in a regression problem
> #
> # Each section below fits the same regression tree with and without case
> # weights and uses all.equal() to check that the fitted objects agree.
> #
> library(rpart)
> set.seed(10)
> 
> # Working copy of state.x77 with the region factor appended.  The columns
> # are given short lower-case names, which the model formulas below use.
> mystate <- data.frame(state.x77, region=factor(state.region))
> names(mystate) <- c("population","income" , "illiteracy","life" ,
+                     "murder", "hs.grad", "frost", "area", "region")
> 
> # Case 1: a constant weight of 3 on every row.
> # xgrp labels the rows with groups 1..10 and is passed as xval to both
> # fits, so the weighted and unweighted fits share the same grouping.
> xgrp <- rep(1:10,5)
> fit4 <- rpart(income ~ population + region + illiteracy +life + murder +
+               hs.grad + frost , mystate,
+               control=rpart.control(minsplit=10, xval=xgrp))
> wts <- rep(3, nrow(mystate))
> fit4b <- rpart(income ~ population + region + illiteracy +life + murder +
+                hs.grad + frost , mystate,
+                control=rpart.control(minsplit=10, xval=xgrp), weights=wts)
> # Rescale the weighted fit before comparing: node weights and deviances
> # are divided by 3, and the fifth cptable column is multiplied by sqrt(3).
> # NOTE(review): column 5 is presumably xstd, the cross-validation standard
> # error -- confirm against the rpart.object documentation.
> fit4b$frame$wt <- fit4b$frame$wt/3
> fit4b$frame$dev <- fit4b$frame$dev/3
> fit4b$cptable[,5] <- fit4b$cptable[,5] * sqrt(3)
> # Components that must agree between the two fits after rescaling.
> temp <- c('frame', 'where', 'splits', 'csplit', 'cptable')
> all.equal(fit4[temp], fit4b[temp])
[1] TRUE
> 
> 
> # Next is a very simple case, but worth keeping
> # Case 2: ten observations and two predictors; minsplit and xval are
> # given directly in the rpart() call.
> dummy <- data.frame(y=1:10, x1=c(10:4, 1:3), x2=c(1,3,5,7,9,2,4,6,8,0))
> 
> xx1 <- rpart(y ~ x1 + x2, dummy, minsplit=4, xval=0)
> xx2 <- rpart(y ~ x1 + x2, dummy, weights=rep(2,10), minsplit=4, xval=0)
> 
> # The node deviances are checked against hard-coded values; with every
> # weight equal to 2 the expected deviances are exactly doubled.
> all.equal(xx1$frame$dev, c(82.5, 10, 2, .5, 10, .5, 2))
[1] TRUE
> all.equal(xx2$frame$dev, c(82.5, 10, 2, .5, 10, .5, 2)*2)
[1] TRUE
> 
> # Now for a set of non-equal weights
> # Case 3: integer weights 1..5 on the original rows (fit5b) versus an
> # unweighted fit on a data set in which each row is physically
> # replicated that many times (fit5).
> # We need to set maxcompete=3 because there just happens to be, in one
> # of the lower nodes, an exact tie between variables "life" and "murder".
> # Round-off error causes fit5 to choose one and fit5b the other.
> # Later -- cut it back to maxdepth=3 for the same reason (a tie).
> #
> nn <- nrow(mystate)
> wts <- rep(1:5, length=nn)
> temp <- rep(1:nn, wts) # row indices: row i appears wts[i] times
> # xgrp2 replicates the group labels in the same pattern as the rows, so
> # every copy of a row carries the group label of its original row.
> xgrp <- rep(1:10, length=nn)
> xgrp2<- rep(xgrp, wts)
> tempc <- rpart.control(minsplit=2, xval=xgrp2, maxsurrogate=0,
+                        maxcompete=3, maxdepth=3)
> # Direct: replicate rows in the data set, and use unweighted
> fit5 <- rpart(income ~ population + region + illiteracy +life + murder +
+               hs.grad + frost , data=mystate[temp,], control=tempc)
> # Weighted: original rows, the un-replicated group labels, weights=wts
> tempc <- rpart.control(minsplit=2, xval=xgrp, maxsurrogate=0,
+                        maxcompete=3, maxdepth=3)
> fit5b <- rpart(income ~ population + region + illiteracy +life + murder +
+                hs.grad + frost , data=mystate, control=tempc,
+                weights=wts)
> # fit5 was fitted to the replicated rows and fit5b to the original rows,
> # so the second frame column is left out of the comparison.
> all.equal(fit5$frame[-2], fit5b$frame[-2]) # the "n" component won't match
[1] TRUE
> all.equal(fit5$cptable, fit5b$cptable)
[1] TRUE
> # The first column of splits is excluded from the comparison.
> # NOTE(review): presumably it is the per-split observation count, which
> # differs for the same reason as "n" -- confirm against rpart.object.
> all.equal(fit5$splits[,-1],fit5b$splits[,-1])
[1] TRUE
> all.equal(fit5$csplit, fit5b$csplit)
[1] TRUE
> 
> proc.time()
   user  system elapsed 
  0.102   0.031   0.128 