CircosHeatmap-aardio/dist/lib/r-library/rpart/tests/priors.R

#
# A hard-core test of losses and priors
#  Simple data set where I know what the answers must be
#
library(rpart)
# Compare two objects by value only: both sides go through as.vector()
# before all.equal() sees them, so a matrix can be checked against a plain
# vector.  Extra arguments (e.g. tolerance) are handed on to all.equal().
aeq <- function(x, y, ...) {
    lhs <- as.vector(x)
    rhs <- as.vector(y)
    all.equal(lhs, rhs, ...)
}

# Toy data set: 15 rows, a three-level response y and three predictors.
# dummy supplies a fixed, hand-picked offset for x3.
dummy <- c(3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9) / 5
pdata <- data.frame(
    y  = factor(rep(1:3, times = 5)),
    x1 = seq_len(15),
    x2 = rep_len(1:6, 15),
    x3 = 10 * (rep(1:3, times = 5) + dummy)
)

# Three missing values in x3
pdata$x3[c(1, 5, 10)] <- NA
# Move the last observation from class 3 to class 1 so that the class
# counts become 6, 5, 4 instead of 5, 5, 5
pdata$y[15] <- 1

# Fit a classification tree on the toy data with user-supplied priors and a
# user-supplied loss matrix.  The control settings (cp=0, xval=0,
# minsplit=5, maxdepth=1) presumably switch off complexity pruning and
# cross-validation and stop after the root split -- see ?rpart.control.
# The loss matrix is entered row by row: (0,2,2), (2,0,6), (1,1,0).
set.seed(10)
pfit <- rpart(y ~ x1 + x2 + x3, pdata,
              cp=0, xval=0, minsplit=5, maxdepth=1,
              parms=list(prior=c(.2, .3, .5),
                         # byrow=TRUE, not T: T is an ordinary variable
                         # that can be reassigned, TRUE is a constant
                         loss =matrix(c(0,2,2,2,0,6,1,1,0), 3,3,byrow=TRUE)))

#
# See section 12.1 of the report for these numbers
#

# Reference quantities for the hand computations that follow.
ntot   <- c(6,5,4)       # number of observations in each class of pdata
# Observed class probabilities.  Note that the name phat is reassigned to a
# function further down in this file, so this vector is informational only.
phat   <- ntot/15
prior  <- c(.2, .3, .5) # priors, the same values that were given to rpart
# Altered priors: prior[i] times the i-th row sum of the loss matrix
# (.2*4, .3*8, .5*2), rescaled to sum to one.
aprior <- c(4,12,5)/21
# Loss matrix, written column by column.  Column j holds the losses that
# loss() applies when class j is predicted.  The first column is (0,2,1) so
# that the matrix equals the byrow matrix (0,2,2 / 2,0,6 / 1,1,0) passed to
# rpart; only that matrix reproduces the altered priors above.
lmat   <- matrix(c(0,2,1, 2,0,1, 2,6,0), ncol=3)

# Gini index of a vector p: one minus the sum of its squared entries.
gini <- function(p) {
    squares <- p^2
    1 - sum(squares)
}
# Total loss of labelling a node as `class`: each entry of n is weighted by
# the matching entry of column `class` of the global loss matrix lmat.
loss <- function(n, class) {
    cost <- lmat[, class]
    sum(n * cost)
}

# Per-class probability mass of a node holding class counts n: the fraction
# n/ntot of each class present in the node, weighted by that class's prior.
phat <- function(n, ntot=c(6,5,4), prior=c(.2, .3, .5)) {
    weighted <- n * prior
    weighted / ntot
}

# Check 1: node deviances.  frame$dev divided by the 15 observations should
# equal the hand-computed loss of each node.  With surrogates in use, the
# two children hold class counts (4,4,0) and (2,1,4).
aeq(pfit$frame$dev / 15,
    c(loss(prior, 2),
      loss(phat(c(4, 4, 0)), 2),
      loss(phat(c(2, 1, 4)), 3)))

# Check 2: node probabilities (column 8 of yval2).
aeq(pfit$frame$yval2[, 8],
    c(1,
      sum(phat(c(4, 4, 0))),
      sum(phat(c(2, 1, 4)))))

# Check 3: columns 5:7 of yval2 against the per-class masses of each node,
# rescaled to sum to one within the node.
aeq(pfit$frame$yval2[, 5:7],
    rbind(prior,
          phat(c(4, 4, 0)) / sum(phat(c(4, 4, 0))),
          phat(c(2, 1, 4)) / sum(phat(c(2, 1, 4)))))

# Node and class probabilities under the altered priors: the same
# computation as phat(), except that prior defaults to the global aprior.
phat2 <- function(n, ntot=c(6,5,4), prior=aprior) {
    weighted <- n * prior
    weighted / ntot
}

# Building block for the gini losses of the base data and of the best
# splits on variables 1, 2 and 3.
# gfun(n) is the gini loss of a node with class counts n: the node's total
# mass under the altered priors times the gini index of its class
# proportions.
gfun <- function(n) {
    mass  <- phat2(n)
    total <- sum(mass)
    total * gini(mass / total)
}
           
# Compare column 3 of the first three rows of pfit$splits with the
# hand-computed gini(parent) - gini(children), scaled by 15.
# The entries are in the order x3, x2, x1 (best split to worst).
# For x3 the missing values cause the "parent" to be viewed as having
# 12 observations instead of 15, hence the parent counts (4,4,4).
aeq(pfit$splits[1:3, 3],
    15 * (c(gfun(c(4, 4, 4)),
            gfun(c(6, 5, 4)),
            gfun(c(6, 5, 4))) -
          c(gfun(c(3, 4, 0)) + gfun(c(1, 0, 4)),
            gfun(c(6, 5, 2)) + gfun(c(0, 0, 2)),
            gfun(c(4, 4, 4)) + gfun(c(2, 1, 0)))))
首次上传 2025-01-12 00:52:51 +08:00			`#`
			`# A hard-core test of losses and priors`
			`# Simple data set where I know what the answers must be`
			`#`
			`library(rpart)`
			`aeq <- function(x,y, ...) all.equal(as.vector(x), as.vector(y), ...)`

			`dummy <- c(3,1,4,1,5,9,2,6,5,3,5,8,9,7,9)/5`
			`pdata <- data.frame(y=factor(rep(1:3, 5)),`
			`x1 = 1:15,`
			`x2 = c(1:6, 1:6, 1:3),`
			`x3 = (rep(1:3, 5) + dummy)*10)`

			`pdata$x3[c(1,5,10)] <- NA`
			`pdata$y[15] <- 1 # make things unbalanced`

			`set.seed(10)`
			`pfit <- rpart(y ~ x1 + x2 + x3, pdata,`
			`cp=0, xval=0, minsplit=5, maxdepth=1,`
			`parms=list(prior=c(.2, .3, .5),`
			`loss =matrix(c(0,2,2,2,0,6,1,1,0), 3,3,byrow=T)))`

			`#`
			`# See section 12.1 of the report for these numbers`
			`#`

			`ntot <- c(6,5,4)`
			`phat <- c(6,5,4)/15 # observed class probabilities`
			`prior <- c(.2, .3, .5) # priors`
			`aprior <- c(4,12,5)/21 # altered priors`
			`lmat <- matrix(c(0,1,2, 2,0,1, 2,6,0), ncol=3) #loss matrix`

			`gini <- function(p) 1-sum(p^2)`
			`loss <- function(n, class) sum(n * lmat[,class])`

			`phat <- function(n, ntot=c(6,5,4), prior=c(.2, .3, .5)) {`
			`n*prior/ntot`
			`}`

			`# Are the losses correct?`
			`# Class counts for the two children are (4,4,0) and (2,1,4), when`
			`# using surrogates`
			`aeq(pfit$frame$dev/15, c(loss(prior,2), loss(phat(c(4,4,0)),2),`
			`loss(phat(c(2,1,4)),3)))`
			`# Node probabilities?`
			`aeq(pfit$frame$yval2[,8] ,`
			`c(1, sum(phat(c(4,4,0))), sum(phat(c(2,1,4)))))`

			`aeq(pfit$frame$yval2[,5:7] , rbind(prior,`
			`phat(c(4,4,0))/ sum(phat(c(4,4,0))),`
			`phat(c(2,1,4))/ sum(phat(c(2,1,4)))))`

			`# Now the node and class probs, under altered priors`
			`phat2 <- function(n, ntot=c(6,5,4), prior=aprior) {`
			`n*prior/ntot`
			`}`

			`# Use these to create the gini losses, base data, and for the best`
			`# splits on variables 1, 2, 3`
			`gfun <- function(n) { #The gini loss for a node, given the counts`
			`temp <- phat2(n)`
			`sum(temp) * gini(temp/sum(temp))`
			`}`

			`# These are in order x3, x2, x1 (best split to worst)`
			`# Note that for x3, missing values cause the "parent" to be viewed as`
			`# having 12 obs instead of 15.`
			`# Each line is gini(parent) - gini(children)`
			`aeq(pfit$splits[1:3, 3],`
			`15* c(gfun(c(4,4,4)) - (gfun(c(3,4,0)) + gfun(c(1,0,4))),`
			`gfun(c(6,5,4)) - (gfun(c(6,5,2)) + gfun(c(0,0,2))),`
			`gfun(c(6,5,4)) - (gfun(c(4,4,4)) + gfun(c(2,1,0)))))`