246 lines
10 KiB
R
Raw Permalink Normal View History

2025-01-12 00:52:51 +08:00
#### lm, glm, aov, etc --- typically *strict* tests (no *.Rout.save)
options(warn = 2, width = 101) # all warnings must be asserted below
data(mtcars)
mtcar2 <- within(mtcars, {
mpg_c <- mpg * (1+am) + 5
am <- factor(am)
})
fm2 <- glm(disp ~ am * mpg + mpg_c, data = mtcar2)
c2 <- coef(fm2)
V2 <- vcov(fm2)
jj <- !is.na(c2)
stopifnot(names(which(!jj)) == "am1:mpg"
, identical(length(c2), 5L), identical(dim(V2), c(5L,5L))
, all.equal(c2[jj], coef(fm2, complete=FALSE))
, all.equal(V2[jj,jj], vcov(fm2, complete=FALSE))
, all.equal(c2[jj], c(`(Intercept)`= 626.0915, am1 = -249.4183,
mpg = -33.74701, mpg_c = 10.97014),
tol = 7e-7)# 1.01e-7 [F26 Lnx 64b]
)
### predict.lm(<rank-deficient>, newdata = *) -- PR#15072, PR#16158 --------------
## constructed "exactly" rank-deficient
x1 <- -4:4
x2 <- c(-2,1,-1,2, 0,
2,-1,1,-2)
x3 <- 3*x1 - 2*x2
x4 <- x2 - x1 + 4
y <- 1 + x1 + x2 + x3 + x4 + c(-.5,.5,.5,-.5, 0,
.5,-.5,-.5,.5)
cbind(x1,x2,x3,x4,y)
## Fit a model
mod1234 <- lm(y ~ x1 + x2 + x3 + x4)
if(requireNamespace("MASS")) {
(al <- alias(mod1234)) # x3: 3*x1 - 2*x2 \\ x4 : 4 -x1 +x2
stopifnot(all.equal(rbind(x3 = c(0, 3,-2),
x4 = c(4, -1, 1)),
unclass(al$Complete), check.attributes=FALSE))
}
## new.x
new.x <- data.frame(
row.names = LETTERS[1:6],
x1 = c(3, 6, 6, 0, 0, 1),
x2 = c(1, 2, 2, 0, 0, 2),
x3 = c(7,14,14, 0, 0, 3),
x4 = c(2, 4, 0, 4, 0, 4))
## where do we have the same aliasing subspace?
new.ok <- with(new.x, (x4 == 4 - x1 + x2) &
(x3 == 3*x1 - 2*x2))
which(new.ok) # 1 3 4
## old (hard-wired) R <= 4.2.x behavior:
tools::assertWarning(ps <- predict(mod1234, newdata=new.x, rankdeficient = "simple"), verbose=TRUE)
ps1 <- predict(mod1234, newdata=data.frame(x1,x2,x3,x4), rankdeficient = "warnif") # *not* warning anymore
## new
(pN <- predict(mod1234, new.x, rankdeficient = "NA"))
tools::assertWarning(pN.<- predict(mod1234, new.x, rankdeficient = "NAwarn"))
## "compromise": old predictions with extra info (no warning):
(pne <- predict(mod1234, new.x, rankdeficient = "non-estim"))
stopifnot(exprs = {
identical(pN, pN.)
all.equal(fitted(mod1234), ps1, tol = 2e-15) # seen 3.11e-16
identical(i.ne <- attr(pne, "non-estim"),
c(B = 2L, E = 5L, F = 6L))
which(!new.ok) == i.ne
is.na(pN[i.ne])
identical(ps[-i.ne], pN[-i.ne])
identical(unname(ps), `attributes<-`(pne, NULL))
})
d8 <- data.frame(
y = c(747625803, -74936705, -750056726, -299805697,
76131520, -225971209, 301836031, 2249594776, 300581863, -2999324198,
450274906, -600962167, 1800954652, 900083298, -1498452810),
X1 = c(149999999, -225000002, -149999999, 149999998, 225000002,
-675000006, -149999998, 449999997, 900000008, -599999996,
1350000012, 299999996, -899999988, -449999994, -299999998),
X2 = c(300000000.5, -149999999, -300000000.5, 1, 149999999,
-449999997, -1, 900000001.5, 599999996, -1200000002,
899999994, 2, -6, -3, -600000001),
X3 = c(-1, 149999998, 1, -150000002, -149999998,
449999994, 150000002, -3, -599999992, 4,
-899999988, -300000004, 900000012, 450000006, 2))
coef(fm8. <- lm(y ~ . -1, data = d8)) # the one for X3 is NA
cf8. <- c(X1 = -1.999854802642, X2 = 3.499496934397, X3 = NA)
all.equal(cf8., coef(fm8.), tol=0)# -> "Mean rel..diff.: ~ 3e-15
stopifnot(all.equal(cf8., coef(fm8.)))
coef(fm8.9 <- lm(y ~ . -1, data = d8, tol = 1e-9)) # no NA , but "instable" -- not too precise
cf8.9 <- c(X1 = 45822.830422, X2 = -22908.915871, X3 = 45824.830295)
all.equal(cf8.9, coef(fm8.9), tol=0)# -> "Mean rel..diff.: 5.3e-9 | 5.15e-12
## was < 2e-8 in R 4.2.2
## x86_64 Linux/gcc12 gives ca 5e-12
## vanilla M1mac gives 6.16e-11, Accelerate on M1 macOS gives 3.99e-10;
## Debian with "generic" (i.e. not R's) BLAS/Lapack *still* gave 5.2985e-09 (?!)
stopifnot(all.equal(cf8.9, coef(fm8.9), tol = 7e-9))
## predict :
nd <- d8[,-1] + rep(outer(c(-2:2),10^(1:3)), 3) # 5 * 9 = 45 = 15 * 3 (nrow * ncol)
row.names(nd) <- LETTERS[1:nrow(nd)]
tools::assertWarning(verbose=TRUE, # "... rank-deficient .. consider predict(., rankdeficient="NA")
ps <- predict(fm8. , newdata=nd, rankdeficient = "simple") )
tools::assertWarning(verbose=TRUE, # "... rank-deficient .. attr(*, "non-estim") has doubtful cases
ps.<- predict(fm8. , newdata=nd) ) # default
pN <- predict(fm8. , newdata=nd, rankdeficient = "NA")
pne <- predict(fm8. , newdata=nd, rankdeficient = "non-estim")
p.9 <- predict(fm8.9, newdata=nd)
print(digits=9, cbind(ps, pne, pN, p.9))
all.equal(p.9, ps, tol=0)# 0.035..
dropAtt <- function(x) `attributes<-`(x, NULL)
stopifnot(exprs = {
ps == ps. # numbers;
identical(unname(ps), dropAtt(ps.))
identical(ps., pne) # both have "non-estim"
identical(i.ne <- attr(pne, "non-estim"),
c(K = 11L, L = 12L, N = 14L, O = 15L))
is.na(pN[i.ne])
identical(ps[-i.ne], pN[-i.ne])
})
## play with tol
str(tls <- sort(outer(c(1,2,4), 10^-(9:5))))
nT <- length(tls <- setNames(tls, formatC(tls)))
pls <- t(sapply(tls, function(TL) predict(fm8. , newdata=nd, tol = TL, rankdeficient = "NA")))
stopifnot(is.finite(plsLst <- pls[nT,])) # (no NA)
plsLst
sweep(pls, 2L, plsLst, `-`)
## This *is* monotone in tol -- still somewhat amazing how much changes
## within two factors of 4 (of tol), i.e., between 2e-7 ... 4e-6
## A B C D E F G H I J K L M N O
## 1e-09 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 2e-09 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 4e-09 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 1e-08 NA NA 0 NA NA NA NA 0 NA NA NA NA NA NA NA
## 2e-08 NA NA 0 NA NA NA NA 0 NA NA NA NA 0 NA NA
## 4e-08 NA NA 0 0 NA NA NA 0 NA NA NA NA 0 NA NA
## 1e-07 0 0 0 0 0 NA NA 0 0 NA NA NA 0 NA NA
## 2e-07 0 0 0 0 0 NA NA 0 0 0 NA NA 0 NA NA
## 4e-07 0 0 0 0 0 0 NA 0 0 0 NA NA 0 NA NA
## 1e-06 0 0 0 0 0 0 0 0 0 0 NA NA 0 NA NA
## 2e-06 0 0 0 0 0 0 0 0 0 0 0 NA 0 0 NA
## 4e-06 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1e-05 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2e-05 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4e-05 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
(iFi <- apply(pls, 2, function(.) which.max(is.finite(.))))
## A B C D E F G H I J K L M N O
## 7 7 4 6 7 9 10 4 7 8 11 12 5 11 12
stopifnot(exprs = {
## checking monotonicity: each column is (NA NA ... NA | p_i p_i ... p_i)
vapply(seq_along(tls),
function(i) length(unique(pls[iFi[i]:nT, i])) == 1L,
NA)
## allow 1 off :
3 <= iFi
iFi <= 13
})
## __FIXME__
## predict(*, ... type="terms" .. ) does *not* obey rankdeficient=".."
##-------- dummy.coef() -- with "character"-factor ---------------------------------------
## [Bug 18635] New: dummy.coef could not deal with character variable // 9 Dec 2023
## --------- https://bugs.r-project.org/show_bug.cgi?id=18635
## for data generating model
ch2num <- function(ch) vapply(ch, function(.) as.integer(charToRaw(.)),
1L, USE.NAMES=FALSE)
## test:
str(print(ch2num(LETTERS)) - ch2num(letters))
set.seed(7)
mydatC <- data.frame(x = sort(rnorm(49)), ch = c(LETTERS[1:3], letters[1:4]))
mydatC$y <- with(mydatC, 20*x + 10 - (ch2num(ch) - 68) + rnorm(x))
str(mydatC)
if(dev.interactive(TRUE)) ## visualize:
plot(y ~ x, data=mydatC, col = factor(ch))
Sys.setlocale("LC_COLLATE", "C")
mydatF <- mydatC; mydatF$ch <- factor(mydatC$ch)
str(mydatF)
## $ ch: Factor w/ 7 levels "A","B","C","a",..: 1 2 3 4 5 6 7 1 2 3 ...
(sfmCc <- summary(fmCc <- lm(y ~ ., data=mydatC)))
sfmCf <- summary(fmCf <- lm(y ~ ., data=mydatF))
(ae.cf <- all.equal(sfmCc, sfmCf)) # only the call differs:
## [1] "Component “call”: target, current do not match when deparsed"
stopifnot(length(ae.cf) == 1L, grepl("^Component .call.:", ae.cf))
coef(fmCc)
## (Intercept) x chB chC cha chb chc chd
## 12.7781626 19.8494272 -0.8240301 -1.3309157 -31.7032317 -32.8819084 -33.3519985 -34.6249161
(coef(fmCf) -> cf.f) # the same
stopifnot(exprs = {
identical(coef(fmCc), cf.f)
})
(dummy.coef(fmCc) -> dc.Cc) ##-- was all wrong in R <= 4.3.2
## (Intercept): 12.77816
## x: 19.84943
## ch: A B C a b c d
## 0.0000000 -0.8240301 -1.3309157 -31.7032317 -32.8819084 -33.3519985 -34.6249161
dummy.coef(fmCf) -> dc.Cf # the same
all.equal15 <- function(x,y, ...) all.equal(x,y, tolerance = 1e-15, ...)
stopifnot(exprs = {
all.equal15(dc.Cc, dc.Cf) # *not* in R <= 4.3.2
## coef() <--> dummy.coef() {was always true}
length(dcCf <- unlist(dc.Cf)) == 1 + length(cf.f)
is.character(names(dcCf) <- sub("[.]", "", names(dcCf)))
all.equal15(dcCf[i2 <- 1:2], cf.f[i2], check.attributes = FALSE)
all.equal15(dcCf[-i2], c(chA = 0, cf.f[-i2]))
})
##============= + 2 way interactions ============================================
fm2c <- lm(y ~ .^2, data=mydatC)
cf2c <- coef(fm2c)
(dc2c <- dummy.coef(fm2c)) # *wrong* in R <= 4.3.2
stopifnot(exprs = {
length(dc2c <- unlist(dc2c)) == 2 + length(cf2c) # was false
all.equal15(dc2c[1:2], cf2c[1:2], check.attributes = FALSE)
is.character(names(dc2c) <- sub("[.]", "", names(dc2c)))
all.equal15(dc2c[-(1:2)][1:7],
c(chA = 0, cf2c[-(1:2)][1:6]))
all.equal15(tail(dc2c, 7),
c(`x:chA` = 0, tail(cf2c, 6)))
})
fm2f <- lm(y ~ .^2, data=mydatF) # was always correct
(dc2f <- dummy.coef(fm2f))
cf2f <- coef(fm2f)
stopifnot(exprs = {
## were all TRUE before
length(dc2f <- unlist(dc2f)) == 2 + length(cf2f)
all.equal(dc2f[1:2], cf2f[1:2], check.attributes = FALSE)
is.character(names(dc2f) <- sub("[.]", "", names(dc2f)))
all.equal15(dc2f[-(1:2)][1:7],
c(chA = 0, cf2f[-(1:2)][1:6]))
all.equal15(tail(dc2f, 7),
c(`x:chA` = 0, tail(cf2f, 6)))
})