103 lines
3.5 KiB
R
103 lines
3.5 KiB
R
## This came from a bug report on R-help by ge yreyt <tothri2000@yahoo.ca>
|
|
## Date: Mon, 9 Jun 2003 16:06:53 -0400 (EDT)
|
|
library(cluster)
|
|
if(FALSE) # manual testing
|
|
library(cluster, lib="~/R/Pkgs/cluster.Rcheck")
|
|
|
|
data(iris)
|
|
|
|
.proctime00 <- proc.time()
|
|
|
|
mdist <- as.dist(1 - cor(t(iris[,1:4])))#dissimlarity
|
|
## this is always the same:
|
|
hc <- diana(mdist, diss = TRUE, stand = FALSE)
|
|
|
|
maxk <- 15 # at most 15 clusters
|
|
silh.wid <- numeric(maxk) # myind[k] := the silh.value for k clusters
|
|
silh.wid[1] <- NA # 1-cluster: silhouette not defined
|
|
|
|
op <- par(mfrow = c(4,4), mar = .1+ c(2,1,2,1), mgp=c(1.5, .6,0))
|
|
for(k in 2:maxk) {
|
|
cat("\n", k,":\n==\n")
|
|
k.gr <- cutree(as.hclust(hc), k = k)
|
|
cat("grouping table: "); print(table(k.gr))
|
|
si <- silhouette(k.gr, mdist)
|
|
cat("silhouette:\n"); print(summary(si))
|
|
plot(si, main = paste("k =",k),
|
|
col = 2:(k+1), do.n.k=FALSE, do.clus.stat=FALSE)
|
|
silh.wid[k] <- summary(si)$avg.width
|
|
## ===
|
|
}
|
|
par(op)
|
|
|
|
summary(si.p <- silhouette(50 - k.gr, mdist))
|
|
stopifnot(identical(si.p[,3], si[,3]),
|
|
identical(si.p[, 1:2], 50 - si[, 1:2]))
|
|
|
|
# the widths:
|
|
silh.wid
|
|
#select the number of k clusters with the largest si value :
|
|
(myk <- which.min(silh.wid)) # -> 8 (here)
|
|
|
|
postscript(file="silhouette-ex.ps")
|
|
## MM: plot to see how the decision is made
|
|
plot(silh.wid, type = 'b', col= "blue", xlab = "k")
|
|
axis(1, at=myk, col.axis= "red", font.axis= 2)
|
|
|
|
##--- PAM()'s silhouette should give same as silh*.default()!
|
|
Eq <- function(x,y, tol = 1e-12) x == y | abs(x - y) < tol * abs((x+y)/2)
|
|
|
|
for(k in 2:40) {
|
|
cat("\n", k,":\n==\n")
|
|
p.k <- pam(mdist, k = k)
|
|
k.gr <- p.k$clustering
|
|
si.p <- silhouette(p.k)
|
|
si.g <- silhouette(k.gr, mdist)
|
|
## since the obs.order may differ (within cluster):
|
|
si.g <- si.g[ as.integer(rownames(si.p)), ]
|
|
cat("grouping table: "); print(table(k.gr))
|
|
if(!isTRUE(all.equal(c(si.g), c(si.p)))) {
|
|
cat("silhouettes differ:")
|
|
if(any(neq <- !Eq(si.g[,3], si.p[,3]))) {
|
|
cat("\n")
|
|
print( cbind(si.p[], si.g[,2:3])[ neq, ] )
|
|
} else cat(" -- but not in col.3 !\n")
|
|
}
|
|
}
|
|
|
|
|
|
## "pathological" case where a_i == b_i == 0 :
|
|
D6 <- structure(c(0, 0, 0, 0.4, 1, 0.05, 1, 1, 0, 1, 1, 0, 0.25, 1, 1),
|
|
Labels = LETTERS[1:6], Size = 6, call = as.name("manually"),
|
|
class = "dist", Diag = FALSE, Upper = FALSE)
|
|
D6
|
|
kl6 <- c(1,1, 2,2, 3,3)
|
|
(skD6 <- silhouette(kl6, D6))# had one NaN
|
|
summary(skD6)
|
|
plot(silhouette(kl6, D6))# gives error in earlier cluster versions
|
|
dev.off()
|
|
|
|
## checking compatibility with R-only version
|
|
silhouetteR <- asNamespace("cluster")$silhouetteR
|
|
noCall <- function(si) `attr<-`(si, "call", NULL) # only 'call' is different:
|
|
stopifnot(all.equal(noCall(skD6), noCall(silhouetteR(kl6, D6))))
|
|
|
|
## k=1 : pam(*, k=1) works, but silhouette() is not defined;
|
|
## --- FIXME: silhouette.partition() fails: "invalid partition .."; (which is not strictly true
|
|
## ------ -> give something like NA ((or a *different* error message)
|
|
## the other methods just give NA (no object!)
|
|
## drop "call" *and* "iOrd"
|
|
noCliO <- function(si) noCall(`attr<-`(si, "iOrd", NULL))
|
|
for(k in 2:7) {
|
|
p.k <- pam(ruspini, k=k)
|
|
## order the silhouette to be *as* the default:
|
|
## spk <- silhouette(p.k); opk <- spk[order(as.numeric(rownames(spk))), ]
|
|
## rather sort*() the other:
|
|
stopifnot(all.equal(noCall(silhouette(p.k)),
|
|
noCliO(sortSilhouette(silhouetteR(p.k$clustering, p.k$diss)))))
|
|
}
|
|
|
|
## Last Line:
|
|
cat('Time elapsed: ', proc.time() - .proctime00,'\n')
|
|
|