98 lines
3.3 KiB
Plaintext
98 lines
3.3 KiB
Plaintext
|
|
R Under development (unstable) (2019-04-05 r76323) -- "Unsuffered Consequences"
|
|
Copyright (C) 2019 The R Foundation for Statistical Computing
|
|
Platform: x86_64-pc-linux-gnu (64-bit)
|
|
|
|
R is free software and comes with ABSOLUTELY NO WARRANTY.
|
|
You are welcome to redistribute it under certain conditions.
|
|
Type 'license()' or 'licence()' for distribution details.
|
|
|
|
R is a collaborative project with many contributors.
|
|
Type 'contributors()' for more information and
|
|
'citation()' on how to cite R or R packages in publications.
|
|
|
|
Type 'demo()' for some demos, 'help()' for on-line help, or
|
|
'help.start()' for an HTML browser interface to help.
|
|
Type 'q()' to quit R.
|
|
|
|
> #
|
|
> # A hard-core test of losses and priors
|
|
> # Simple data set where I know what the answers must be
|
|
> #
|
|
> library(rpart)
|
|
> aeq <- function(x,y, ...) all.equal(as.vector(x), as.vector(y), ...)
|
|
>
|
|
> dummy <- c(3,1,4,1,5,9,2,6,5,3,5,8,9,7,9)/5
|
|
> pdata <- data.frame(y=factor(rep(1:3, 5)),
|
|
+ x1 = 1:15,
|
|
+ x2 = c(1:6, 1:6, 1:3),
|
|
+ x3 = (rep(1:3, 5) + dummy)*10)
|
|
>
|
|
> pdata$x3[c(1,5,10)] <- NA
|
|
> pdata$y[15] <- 1 # make things unbalanced
|
|
>
|
|
> set.seed(10)
|
|
> pfit <- rpart(y ~ x1 + x2 + x3, pdata,
|
|
+ cp=0, xval=0, minsplit=5, maxdepth=1,
|
|
+ parms=list(prior=c(.2, .3, .5),
|
|
+ loss =matrix(c(0,2,2,2,0,6,1,1,0), 3,3,byrow=T)))
|
|
>
|
|
> #
|
|
> # See section 12.1 of the report for these numbers
|
|
> #
|
|
>
|
|
> ntot <- c(6,5,4)
|
|
> phat <- c(6,5,4)/15 # observed class probabilities
|
|
> prior <- c(.2, .3, .5) # priors
|
|
> aprior <- c(4,12,5)/21 # altered priors
|
|
> lmat <- matrix(c(0,1,2, 2,0,1, 2,6,0), ncol=3) #loss matrix
|
|
>
|
|
> gini <- function(p) 1-sum(p^2)
|
|
> loss <- function(n, class) sum(n * lmat[,class])
|
|
>
|
|
> phat <- function(n, ntot=c(6,5,4), prior=c(.2, .3, .5)) {
|
|
+ n*prior/ntot
|
|
+ }
|
|
>
|
|
> # Are the losses correct?
|
|
> # Class counts for the two children are (4,4,0) and (2,1,4), when
|
|
> # using surrogates
|
|
> aeq(pfit$frame$dev/15, c(loss(prior,2), loss(phat(c(4,4,0)),2),
|
|
+ loss(phat(c(2,1,4)),3)))
|
|
[1] TRUE
|
|
> # Node probabilities?
|
|
> aeq(pfit$frame$yval2[,8] ,
|
|
+ c(1, sum(phat(c(4,4,0))), sum(phat(c(2,1,4)))))
|
|
[1] TRUE
|
|
>
|
|
> aeq(pfit$frame$yval2[,5:7] , rbind(prior,
|
|
+ phat(c(4,4,0))/ sum(phat(c(4,4,0))),
|
|
+ phat(c(2,1,4))/ sum(phat(c(2,1,4)))))
|
|
[1] TRUE
|
|
>
|
|
> # Now the node and class probs, under altered priors
|
|
> phat2 <- function(n, ntot=c(6,5,4), prior=aprior) {
|
|
+ n*prior/ntot
|
|
+ }
|
|
>
|
|
> # Use these to create the gini losses, base data, and for the best
|
|
> # splits on variables 1, 2, 3
|
|
> gfun <- function(n) { #The gini loss for a node, given the counts
|
|
+ temp <- phat2(n)
|
|
+ sum(temp) * gini(temp/sum(temp))
|
|
+ }
|
|
>
|
|
> # These are in order x3, x2, x1 (best split to worst)
|
|
> # Note that for x3, missing values cause the "parent" to be viewed as
|
|
> # having 12 obs instead of 15.
|
|
> # Each line is gini(parent) - gini(children)
|
|
> aeq(pfit$splits[1:3, 3],
|
|
+ 15* c(gfun(c(4,4,4)) - (gfun(c(3,4,0)) + gfun(c(1,0,4))),
|
|
+ gfun(c(6,5,4)) - (gfun(c(6,5,2)) + gfun(c(0,0,2))),
|
|
+ gfun(c(6,5,4)) - (gfun(c(4,4,4)) + gfun(c(2,1,0)))))
|
|
[1] TRUE
|
|
>
|
|
> proc.time()
|
|
user system elapsed
|
|
0.129 0.012 0.135
|