2025-01-12 04:36:52 +08:00

103 lines
3.5 KiB
R

## This came from a bug report on R-help by ge yreyt <tothri2000@yahoo.ca>
## Date: Mon, 9 Jun 2003 16:06:53 -0400 (EDT)
library(cluster)
if(FALSE) # manual testing
library(cluster, lib="~/R/Pkgs/cluster.Rcheck")
data(iris)
.proctime00 <- proc.time()
mdist <- as.dist(1 - cor(t(iris[,1:4])))#dissimlarity
## this is always the same:
hc <- diana(mdist, diss = TRUE, stand = FALSE)
maxk <- 15 # at most 15 clusters
silh.wid <- numeric(maxk) # myind[k] := the silh.value for k clusters
silh.wid[1] <- NA # 1-cluster: silhouette not defined
op <- par(mfrow = c(4,4), mar = .1+ c(2,1,2,1), mgp=c(1.5, .6,0))
for(k in 2:maxk) {
cat("\n", k,":\n==\n")
k.gr <- cutree(as.hclust(hc), k = k)
cat("grouping table: "); print(table(k.gr))
si <- silhouette(k.gr, mdist)
cat("silhouette:\n"); print(summary(si))
plot(si, main = paste("k =",k),
col = 2:(k+1), do.n.k=FALSE, do.clus.stat=FALSE)
silh.wid[k] <- summary(si)$avg.width
## ===
}
par(op)
summary(si.p <- silhouette(50 - k.gr, mdist))
stopifnot(identical(si.p[,3], si[,3]),
identical(si.p[, 1:2], 50 - si[, 1:2]))
# the widths:
silh.wid
#select the number of k clusters with the largest si value :
(myk <- which.min(silh.wid)) # -> 8 (here)
postscript(file="silhouette-ex.ps")
## MM: plot to see how the decision is made
plot(silh.wid, type = 'b', col= "blue", xlab = "k")
axis(1, at=myk, col.axis= "red", font.axis= 2)
##--- PAM()'s silhouette should give same as silh*.default()!
Eq <- function(x,y, tol = 1e-12) x == y | abs(x - y) < tol * abs((x+y)/2)
for(k in 2:40) {
cat("\n", k,":\n==\n")
p.k <- pam(mdist, k = k)
k.gr <- p.k$clustering
si.p <- silhouette(p.k)
si.g <- silhouette(k.gr, mdist)
## since the obs.order may differ (within cluster):
si.g <- si.g[ as.integer(rownames(si.p)), ]
cat("grouping table: "); print(table(k.gr))
if(!isTRUE(all.equal(c(si.g), c(si.p)))) {
cat("silhouettes differ:")
if(any(neq <- !Eq(si.g[,3], si.p[,3]))) {
cat("\n")
print( cbind(si.p[], si.g[,2:3])[ neq, ] )
} else cat(" -- but not in col.3 !\n")
}
}
## "pathological" case where a_i == b_i == 0 :
D6 <- structure(c(0, 0, 0, 0.4, 1, 0.05, 1, 1, 0, 1, 1, 0, 0.25, 1, 1),
Labels = LETTERS[1:6], Size = 6, call = as.name("manually"),
class = "dist", Diag = FALSE, Upper = FALSE)
D6
kl6 <- c(1,1, 2,2, 3,3)
(skD6 <- silhouette(kl6, D6))# had one NaN
summary(skD6)
plot(silhouette(kl6, D6))# gives error in earlier cluster versions
dev.off()
## checking compatibility with R-only version
silhouetteR <- asNamespace("cluster")$silhouetteR
noCall <- function(si) `attr<-`(si, "call", NULL) # only 'call' is different:
stopifnot(all.equal(noCall(skD6), noCall(silhouetteR(kl6, D6))))
## k=1 : pam(*, k=1) works, but silhouette() is not defined;
## --- FIXME: silhouette.partition() fails: "invalid partition .."; (which is not strictly true
## ------ -> give something like NA ((or a *different* error message)
## the other methods just give NA (no object!)
## drop "call" *and* "iOrd"
noCliO <- function(si) noCall(`attr<-`(si, "iOrd", NULL))
for(k in 2:7) {
p.k <- pam(ruspini, k=k)
## order the silhouette to be *as* the default:
## spk <- silhouette(p.k); opk <- spk[order(as.numeric(rownames(spk))), ]
## rather sort*() the other:
stopifnot(all.equal(noCall(silhouette(p.k)),
noCliO(sortSilhouette(silhouetteR(p.k$clustering, p.k$diss)))))
}
## Last Line:
cat('Time elapsed: ', proc.time() - .proctime00,'\n')