1145 lines
2.5 MiB
HTML
Raw Normal View History

2025-01-12 00:52:51 +08:00
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="author" content="Tal Galili" />
<meta name="date" content="2024-11-15" />
<title>Hierarchical cluster analysis on famous data sets - enhanced with the dendextend package</title>
<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
// be compatible with the behavior of Pandoc < 2.8).
// Once the DOM is parsed, strip every attribute from the heading that opens
// each Pandoc section (div.section with a "level" class).
document.addEventListener('DOMContentLoaded', function (_event) {
  var headingTag = /^h[1-6]$/i;
  var firstChildren = document.querySelectorAll("div.section[class*='level'] > :first-child");
  for (var idx = 0; idx < firstChildren.length; idx += 1) {
    var node = firstChildren[idx];
    // Only genuine headings (h1-h6) are cleaned; any other first child is left alone.
    if (headingTag.test(node.tagName)) {
      var attrs = node.attributes;
      // Keep removing the first remaining attribute until the collection is empty.
      while (attrs.length > 0) {
        node.removeAttribute(attrs[0].name);
      }
    }
  }
});
</script>
<style type="text/css">
/* Baseline rules for the generic classes Pandoc emits. */

/* Inline code keeps its whitespace but may wrap. */
code {
  white-space: pre-wrap;
}

/* Text-decoration helpers. */
span.smallcaps {
  font-variant: small-caps;
}
span.underline {
  text-decoration: underline;
}

/* Two side-by-side columns, each half the container width. */
div.column {
  display: inline-block;
  vertical-align: top;
  width: 50%;
}

/* Hanging indent: the first line starts 1.5em to the left of the rest. */
div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}

/* Task lists are rendered without bullets. */
ul.task-list {
  list-style: none;
}
</style>
<style type="text/css">
code {
white-space: pre;
}
.sourceCode {
overflow: visible;
}
</style>
<style type="text/css" data-origin="pandoc">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; }
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.at { color: #7d9029; }
code span.bn { color: #40a070; }
code span.bu { color: #008000; }
code span.cf { color: #007020; font-weight: bold; }
code span.ch { color: #4070a0; }
code span.cn { color: #880000; }
code span.co { color: #60a0b0; font-style: italic; }
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.do { color: #ba2121; font-style: italic; }
code span.dt { color: #902000; }
code span.dv { color: #40a070; }
code span.er { color: #ff0000; font-weight: bold; }
code span.ex { }
code span.fl { color: #40a070; }
code span.fu { color: #06287e; }
code span.im { color: #008000; font-weight: bold; }
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.kw { color: #007020; font-weight: bold; }
code span.op { color: #666666; }
code span.ot { color: #007020; }
code span.pp { color: #bc7a00; }
code span.sc { color: #4070a0; }
code span.ss { color: #bb6688; }
code span.st { color: #4070a0; }
code span.va { color: #19177c; }
code span.vs { color: #4070a0; }
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
</style>
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
// Runs immediately: in the stylesheet tagged data-origin="pandoc", any
// top-level "div.sourceCode" rule that sets a text or background colour is
// replaced, in place, by a "pre.sourceCode" rule with the same declarations.
(function () {
  var allSheets = document.styleSheets;
  for (var s = 0; s < allSheets.length; s++) {
    var sheet = allSheets[s];
    if (sheet.ownerNode.dataset["origin"] !== "pandoc") continue;

    // Reading cssRules can throw; such sheets are skipped.
    var ruleList;
    try {
      ruleList = sheet.cssRules;
    } catch (err) {
      continue;
    }

    for (var r = 0; r < ruleList.length; r++) {
      var current = ruleList[r];

      // Only plain style rules whose selector is exactly "div.sourceCode" qualify.
      var isDivSourceCode =
        current.type === current.STYLE_RULE && current.selectorText === "div.sourceCode";
      if (!isDivSourceCode) continue;

      // Rules that set neither color nor background-color are left untouched.
      var setsColour =
        current.style.color !== '' || current.style.backgroundColor !== '';
      if (!setsColour) continue;

      // Swap the rule at the same index so the cascade order is preserved.
      var declarations = current.style.cssText;
      sheet.deleteRule(r);
      sheet.insertRule('pre.sourceCode{' + declarations + '}', r);
    }
  }
})();
</script>
<style type="text/css">body {
background-color: #fff;
margin: 1em auto;
max-width: 700px;
overflow: visible;
padding-left: 2em;
padding-right: 2em;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
font-size: 14px;
line-height: 1.35;
}
#TOC {
clear: both;
margin: 0 0 10px 10px;
padding: 4px;
width: 400px;
border: 1px solid #CCCCCC;
border-radius: 5px;
background-color: #f6f6f6;
font-size: 13px;
line-height: 1.3;
}
#TOC .toctitle {
font-weight: bold;
font-size: 15px;
margin-left: 5px;
}
#TOC ul {
padding-left: 40px;
margin-left: -1.5em;
margin-top: 5px;
margin-bottom: 5px;
}
#TOC ul ul {
margin-left: -2em;
}
#TOC li {
line-height: 16px;
}
table {
margin: 1em auto;
border-width: 1px;
border-color: #DDDDDD;
border-style: outset;
border-collapse: collapse;
}
table th {
border-width: 2px;
padding: 5px;
border-style: inset;
}
table td {
border-width: 1px;
border-style: inset;
line-height: 18px;
padding: 5px 5px;
}
table, table th, table td {
border-left-style: none;
border-right-style: none;
}
table thead, table tr.even {
background-color: #f7f7f7;
}
p {
margin: 0.5em 0;
}
blockquote {
background-color: #f6f6f6;
padding: 0.25em 0.75em;
}
/* Horizontal rule: a single 1px line on the top edge with 28px of vertical
   spacing. The former `border-style: solid` declaration was removed because
   the `border: none` shorthand that followed it reset the border style on
   every side, so it never took effect. */
hr {
  border: none;
  border-top: 1px solid #777;
  margin: 28px 0;
}
dl {
margin-left: 0;
}
dl dd {
margin-bottom: 13px;
margin-left: 13px;
}
dl dt {
font-weight: bold;
}
ul {
margin-top: 0;
}
ul li {
list-style: circle outside;
}
ul ul {
margin-bottom: 0;
}
pre, code {
background-color: #f7f7f7;
border-radius: 3px;
color: #333;
white-space: pre-wrap;
}
pre {
border-radius: 3px;
margin: 5px 0px 10px 0px;
padding: 10px;
}
pre:not([class]) {
background-color: #f7f7f7;
}
code {
font-family: Consolas, Monaco, 'Courier New', monospace;
font-size: 85%;
}
p > code, li > code {
padding: 2px 0px;
}
div.figure {
text-align: center;
}
/* Images: white backing, thin grey frame with rounded corners, and a small
   horizontal gap. The rule used to declare `border` twice; only the later
   value (#CCCCCC) ever applied, so the overridden #DDDDDD declaration was
   removed. */
img {
  background-color: #FFFFFF;
  padding: 2px;
  border: 1px solid #CCCCCC;
  border-radius: 3px;
  margin: 0 5px;
}
h1 {
margin-top: 0;
font-size: 35px;
line-height: 40px;
}
h2 {
border-bottom: 4px solid #f7f7f7;
padding-top: 10px;
padding-bottom: 2px;
font-size: 145%;
}
h3 {
border-bottom: 2px solid #f7f7f7;
padding-top: 10px;
font-size: 120%;
}
h4 {
border-bottom: 1px solid #f7f7f7;
margin-left: 8px;
font-size: 105%;
}
h5, h6 {
border-bottom: 1px solid #ccc;
font-size: 105%;
}
a {
color: #0033dd;
text-decoration: none;
}
a:hover {
color: #6666ff; }
a:visited {
color: #800080; }
a:visited:hover {
color: #BB00BB; }
a[href^="http:"] {
text-decoration: underline; }
a[href^="https:"] {
text-decoration: underline; }
code > span.kw { color: #555; font-weight: bold; }
code > span.dt { color: #902000; }
code > span.dv { color: #40a070; }
code > span.bn { color: #d14; }
code > span.fl { color: #d14; }
code > span.ch { color: #d14; }
code > span.st { color: #d14; }
code > span.co { color: #888888; font-style: italic; }
code > span.ot { color: #007020; }
code > span.al { color: #ff0000; font-weight: bold; }
code > span.fu { color: #900; font-weight: bold; }
code > span.er { color: #a61717; background-color: #e3d2d2; }
</style>
</head>
<body>
<h1 class="title toc-ignore">Hierarchical cluster analysis on famous
data sets - enhanced with the <em>dendextend</em> package</h1>
<h4 class="author">Tal Galili</h4>
<h4 class="date">2024-11-15</h4>
<div id="TOC">
<ul>
<li><a href="#introduction" id="toc-introduction">Introduction</a></li>
<li><a href="#iris---edgar-andersons-iris-data" id="toc-iris---edgar-andersons-iris-data">iris - Edgar Anderson’s Iris
Data</a>
<ul>
<li><a href="#background" id="toc-background">Background</a></li>
<li><a href="#the-3-clusters-from-the-complete-method-vs-the-real-species-category" id="toc-the-3-clusters-from-the-complete-method-vs-the-real-species-category">The
3 clusters from the “complete” method vs the real species
category</a></li>
<li><a href="#similaritydifference-between-various-clustering-algorithms" id="toc-similaritydifference-between-various-clustering-algorithms">Similarity/difference
between various clustering algorithms</a></li>
<li><a href="#clustering-prediction-of-the-3-species-classes" id="toc-clustering-prediction-of-the-3-species-classes">Clustering
prediction of the 3 species classes</a></li>
<li><a href="#conclusion" id="toc-conclusion">Conclusion</a></li>
</ul></li>
<li><a href="#khan---microarray-gene-expression-data-set-from-khan-et-al.-2001.-subset-of-306-genes." id="toc-khan---microarray-gene-expression-data-set-from-khan-et-al.-2001.-subset-of-306-genes.">khan
- Microarray gene expression data set from Khan et al., 2001. Subset of
306 genes.</a>
<ul>
<li><a href="#background-1" id="toc-background-1">Background</a></li>
<li><a href="#comparing-the-train-vs-test-dendrograms" id="toc-comparing-the-train-vs-test-dendrograms">Comparing the train vs
test dendrograms</a></li>
<li><a href="#conclusion-1" id="toc-conclusion-1">Conclusion</a></li>
</ul></li>
<li><a href="#votes.repub---votes-for-republican-candidate-in-presidential-elections" id="toc-votes.repub---votes-for-republican-candidate-in-presidential-elections">votes.repub
- Votes for Republican Candidate in Presidential Elections</a>
<ul>
<li><a href="#background-2" id="toc-background-2">Background</a></li>
<li><a href="#heatmap" id="toc-heatmap">Heatmap</a></li>
</ul></li>
<li><a href="#animals---attributes-of-animals" id="toc-animals---attributes-of-animals">animals - Attributes of
Animals</a>
<ul>
<li><a href="#background-3" id="toc-background-3">Background</a></li>
<li><a href="#heatmap-1" id="toc-heatmap-1">Heatmap</a></li>
</ul></li>
</ul>
</div>
<!--
%\VignetteEngine{knitr::rmarkdown}
%\VignetteIndexEntry{Hierarchical cluster analysis on famous data sets - enhanced with the _dendextend_ package}
-->
<div id="introduction" class="section level2">
<h2>Introduction</h2>
<p>This document demonstrates, on several famous data sets, how the
<em>dendextend</em> R package can be used to enhance Hierarchical
Cluster Analysis (through better visualization and sensitivity
analysis).</p>
</div>
<div id="iris---edgar-andersons-iris-data" class="section level2">
<h2>iris - Edgar Anderson’s Iris Data</h2>
<div id="background" class="section level3">
<h3>Background</h3>
<blockquote>
<p>The famous (Fisher’s or Anderson’s) iris data set gives the
measurements in centimeters of the variables sepal length and width and
petal length and width, respectively, for 50 flowers from each of 3
species of iris. The species are Iris setosa, versicolor, and virginica.
(from <code>?iris</code>)</p>
</blockquote>
<p>The <a href="https://en.wikipedia.org/wiki/Iris_flower_data_set">Iris
flower data set</a> is fun for learning supervised classification
algorithms, and is known as a difficult case for unsupervised learning.
This is easily seen through the following Scatter Plot Matrix
(SPLOM):</p>
<p>Define variables:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a>iris <span class="ot">&lt;-</span> datasets<span class="sc">::</span>iris</span>
<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a>iris2 <span class="ot">&lt;-</span> iris[,<span class="sc">-</span><span class="dv">5</span>]</span>
<span id="cb1-3"><a href="#cb1-3" tabindex="-1"></a>species_labels <span class="ot">&lt;-</span> iris[,<span class="dv">5</span>]</span>
<span id="cb1-4"><a href="#cb1-4" tabindex="-1"></a><span class="fu">library</span>(colorspace) <span class="co"># get nice colors</span></span>
<span id="cb1-5"><a href="#cb1-5" tabindex="-1"></a>species_col <span class="ot">&lt;-</span> <span class="fu">rev</span>(<span class="fu">rainbow_hcl</span>(<span class="dv">3</span>))[<span class="fu">as.numeric</span>(species_labels)]</span></code></pre></div>
<p>SPLOM:</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a><span class="co"># Plot a SPLOM:</span></span>
<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a><span class="fu">pairs</span>(iris2, <span class="at">col =</span> species_col,</span>
<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a> <span class="at">lower.panel =</span> <span class="cn">NULL</span>,</span>
<span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a> <span class="at">cex.labels=</span><span class="dv">2</span>, <span class="at">pch=</span><span class="dv">19</span>, <span class="at">cex =</span> <span class="fl">1.2</span>)</span>
<span id="cb2-5"><a href="#cb2-5" tabindex="-1"></a></span>
<span id="cb2-6"><a href="#cb2-6" tabindex="-1"></a><span class="co"># Add a legend</span></span>
<span id="cb2-7"><a href="#cb2-7" tabindex="-1"></a><span class="fu">par</span>(<span class="at">xpd =</span> <span class="cn">TRUE</span>)</span>
<span id="cb2-8"><a href="#cb2-8" tabindex="-1"></a><span class="fu">legend</span>(<span class="at">x =</span> <span class="fl">0.05</span>, <span class="at">y =</span> <span class="fl">0.4</span>, <span class="at">cex =</span> <span class="dv">2</span>,</span>
<span id="cb2-9"><a href="#cb2-9" tabindex="-1"></a> <span class="at">legend =</span> <span class="fu">as.character</span>(<span class="fu">levels</span>(species_labels)),</span>
<span id="cb2-10"><a href="#cb2-10" tabindex="-1"></a> <span class="at">fill =</span> <span class="fu">unique</span>(species_col))</span>
<span id="cb2-11"><a href="#cb2-11" tabindex="-1"></a><span class="fu">par</span>(<span class="at">xpd =</span> <span class="cn">NA</span>)</span></code></pre></div>
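<p><img role="img" alt="Scatter plot matrix (SPLOM) of the four iris measurements, with points colored by species" /><!-- embedded image data is missing from this copy of the file --></p>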
<p>We can see that the <em>Setosa</em> species are distinctly different
from <em>Versicolor</em> and <em>Virginica</em> (they have lower petal
length and width). But <em>Versicolor</em> and <em>Virginica</em> cannot
easily be separated based on measurements of their sepal and petal
width/length.</p>
<p>The same conclusion can be made by looking at the parallel
coordinates plot of the data:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a><span class="co"># http://blog.safaribooksonline.com/2014/03/31/mastering-parallel-coordinate-charts-r/</span></span>
<span id="cb3-2"><a href="#cb3-2" tabindex="-1"></a><span class="fu">par</span>(<span class="at">las =</span> <span class="dv">1</span>, <span class="at">mar =</span> <span class="fu">c</span>(<span class="fl">4.5</span>, <span class="dv">3</span>, <span class="dv">3</span>, <span class="dv">2</span>) <span class="sc">+</span> <span class="fl">0.1</span>, <span class="at">cex =</span> .<span class="dv">8</span>)</span>
<span id="cb3-3"><a href="#cb3-3" tabindex="-1"></a>MASS<span class="sc">::</span><span class="fu">parcoord</span>(iris2, <span class="at">col =</span> species_col, <span class="at">var.label =</span> <span class="cn">TRUE</span>, <span class="at">lwd =</span> <span class="dv">2</span>)</span>
<span id="cb3-4"><a href="#cb3-4" tabindex="-1"></a></span>
<span id="cb3-5"><a href="#cb3-5" tabindex="-1"></a><span class="co"># Add Title</span></span>
<span id="cb3-6"><a href="#cb3-6" tabindex="-1"></a><span class="fu">title</span>(<span class="st">&quot;Parallel coordinates plot of the Iris data&quot;</span>)</span>
<span id="cb3-7"><a href="#cb3-7" tabindex="-1"></a><span class="co"># Add a legend</span></span>
<span id="cb3-8"><a href="#cb3-8" tabindex="-1"></a><span class="fu">par</span>(<span class="at">xpd =</span> <span class="cn">TRUE</span>)</span>
<span id="cb3-9"><a href="#cb3-9" tabindex="-1"></a><span class="fu">legend</span>(<span class="at">x =</span> <span class="fl">1.75</span>, <span class="at">y =</span> <span class="sc">-</span>.<span class="dv">25</span>, <span class="at">cex =</span> <span class="dv">1</span>,</span>
<span id="cb3-10"><a href="#cb3-10" tabindex="-1"></a> <span class="at">legend =</span> <span class="fu">as.character</span>(<span class="fu">levels</span>(species_labels)),</span>
<span id="cb3-11"><a href="#cb3-11" tabindex="-1"></a> <span class="at">fill =</span> <span class="fu">unique</span>(species_col), <span class="at">horiz =</span> <span class="cn">TRUE</span>)</span></code></pre></div>
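<p><img role="img" alt="Parallel coordinates plot of the Iris data, with lines colored by species" /><!-- embedded image data is missing from this copy of the file --></p>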
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a><span class="fu">par</span>(<span class="at">xpd =</span> <span class="cn">NA</span>)</span></code></pre></div>
</div>
<div id="the-3-clusters-from-the-complete-method-vs-the-real-species-category" class="section level3">
<h3>The 3 clusters from the “complete” method vs the real species
category</h3>
<p>The default hierarchical clustering method in <code>hclust</code> is
“complete”. We can visualize the result of running it by turning the
object to a dendrogram and making several adjustments to the object,
such as: changing the labels, coloring the labels based on the real
species category, and coloring the branches based on cutting the tree
into three clusters.</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a>d_iris <span class="ot">&lt;-</span> <span class="fu">dist</span>(iris2) <span class="co"># method=&quot;man&quot; # is a bit better</span></span>
<span id="cb5-2"><a href="#cb5-2" tabindex="-1"></a>hc_iris <span class="ot">&lt;-</span> <span class="fu">hclust</span>(d_iris, <span class="at">method =</span> <span class="st">&quot;complete&quot;</span>)</span>
<span id="cb5-3"><a href="#cb5-3" tabindex="-1"></a>iris_species <span class="ot">&lt;-</span> <span class="fu">rev</span>(<span class="fu">levels</span>(iris[,<span class="dv">5</span>]))</span>
<span id="cb5-4"><a href="#cb5-4" tabindex="-1"></a></span>
<span id="cb5-5"><a href="#cb5-5" tabindex="-1"></a><span class="fu">library</span>(dendextend)</span>
<span id="cb5-6"><a href="#cb5-6" tabindex="-1"></a>dend <span class="ot">&lt;-</span> <span class="fu">as.dendrogram</span>(hc_iris)</span>
<span id="cb5-7"><a href="#cb5-7" tabindex="-1"></a><span class="co"># order it the closest we can to the order of the observations:</span></span>
<span id="cb5-8"><a href="#cb5-8" tabindex="-1"></a>dend <span class="ot">&lt;-</span> <span class="fu">rotate</span>(dend, <span class="dv">1</span><span class="sc">:</span><span class="dv">150</span>)</span>
<span id="cb5-9"><a href="#cb5-9" tabindex="-1"></a></span>
<span id="cb5-10"><a href="#cb5-10" tabindex="-1"></a><span class="co"># Color the branches based on the clusters:</span></span>
<span id="cb5-11"><a href="#cb5-11" tabindex="-1"></a>dend <span class="ot">&lt;-</span> <span class="fu">color_branches</span>(dend, <span class="at">k=</span><span class="dv">3</span>) <span class="co">#, groupLabels=iris_species)</span></span>
<span id="cb5-12"><a href="#cb5-12" tabindex="-1"></a></span>
<span id="cb5-13"><a href="#cb5-13" tabindex="-1"></a><span class="co"># Manually match the labels, as much as possible, to the real classification of the flowers:</span></span>
<span id="cb5-14"><a href="#cb5-14" tabindex="-1"></a><span class="fu">labels_colors</span>(dend) <span class="ot">&lt;-</span></span>
<span id="cb5-15"><a href="#cb5-15" tabindex="-1"></a> <span class="fu">rainbow_hcl</span>(<span class="dv">3</span>)[<span class="fu">sort_levels_values</span>(</span>
<span id="cb5-16"><a href="#cb5-16" tabindex="-1"></a> <span class="fu">as.numeric</span>(iris[,<span class="dv">5</span>])[<span class="fu">order.dendrogram</span>(dend)]</span>
<span id="cb5-17"><a href="#cb5-17" tabindex="-1"></a> )]</span>
<span id="cb5-18"><a href="#cb5-18" tabindex="-1"></a></span>
<span id="cb5-19"><a href="#cb5-19" tabindex="-1"></a><span class="co"># We shall add the flower type to the labels:</span></span>
<span id="cb5-20"><a href="#cb5-20" tabindex="-1"></a><span class="fu">labels</span>(dend) <span class="ot">&lt;-</span> <span class="fu">paste</span>(<span class="fu">as.character</span>(iris[,<span class="dv">5</span>])[<span class="fu">order.dendrogram</span>(dend)],</span>
<span id="cb5-21"><a href="#cb5-21" tabindex="-1"></a> <span class="st">&quot;(&quot;</span>,<span class="fu">labels</span>(dend),<span class="st">&quot;)&quot;</span>, </span>
<span id="cb5-22"><a href="#cb5-22" tabindex="-1"></a> <span class="at">sep =</span> <span class="st">&quot;&quot;</span>)</span>
<span id="cb5-23"><a href="#cb5-23" tabindex="-1"></a><span class="co"># We hang the dendrogram a bit:</span></span>
<span id="cb5-24"><a href="#cb5-24" tabindex="-1"></a>dend <span class="ot">&lt;-</span> <span class="fu">hang.dendrogram</span>(dend,<span class="at">hang_height=</span><span class="fl">0.1</span>)</span>
<span id="cb5-25"><a href="#cb5-25" tabindex="-1"></a><span class="co"># reduce the size of the labels:</span></span>
<span id="cb5-26"><a href="#cb5-26" tabindex="-1"></a><span class="co"># dend &lt;- assign_values_to_leaves_nodePar(dend, 0.5, &quot;lab.cex&quot;)</span></span>
<span id="cb5-27"><a href="#cb5-27" tabindex="-1"></a>dend <span class="ot">&lt;-</span> <span class="fu">set</span>(dend, <span class="st">&quot;labels_cex&quot;</span>, <span class="fl">0.5</span>)</span>
<span id="cb5-28"><a href="#cb5-28" tabindex="-1"></a><span class="co"># And plot:</span></span>
<span id="cb5-29"><a href="#cb5-29" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mar =</span> <span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">3</span>,<span class="dv">3</span>,<span class="dv">7</span>))</span>
<span id="cb5-30"><a href="#cb5-30" tabindex="-1"></a><span class="fu">plot</span>(dend, </span>
<span id="cb5-31"><a href="#cb5-31" tabindex="-1"></a> <span class="at">main =</span> <span class="st">&quot;Clustered Iris data set</span></span>
<span id="cb5-32"><a href="#cb5-32" tabindex="-1"></a><span class="st"> (the labels give the true flower species)&quot;</span>, </span>
<span id="cb5-33"><a href="#cb5-33" tabindex="-1"></a> <span class="at">horiz =</span> <span class="cn">TRUE</span>, <span class="at">nodePar =</span> <span class="fu">list</span>(<span class="at">cex =</span> .<span class="dv">007</span>))</span>
<span id="cb5-34"><a href="#cb5-34" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend =</span> iris_species, <span class="at">fill =</span> <span class="fu">rainbow_hcl</span>(<span class="dv">3</span>))</span></code></pre></div>
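<p><img role="img" alt="Horizontal dendrogram of the clustered Iris data set; the labels give the true flower species" /><!-- embedded image data is missing from this copy of the file --></p>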
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a><span class="do">#### BTW, notice that:</span></span>
<span id="cb6-2"><a href="#cb6-2" tabindex="-1"></a><span class="co"># labels(hc_iris) # no labels, because &quot;iris&quot; has no row names</span></span>
<span id="cb6-3"><a href="#cb6-3" tabindex="-1"></a><span class="co"># is.integer(labels(dend)) # this could cause problems...</span></span>
<span id="cb6-4"><a href="#cb6-4" tabindex="-1"></a><span class="co"># is.character(labels(dend)) # labels are no longer &quot;integer&quot;</span></span></code></pre></div>
<p>The same can be presented in a circular layout:</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a><span class="co"># Requires that the circlize package will be installed</span></span>
<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mar =</span> <span class="fu">rep</span>(<span class="dv">0</span>,<span class="dv">4</span>))</span>
<span id="cb7-3"><a href="#cb7-3" tabindex="-1"></a><span class="fu">circlize_dendrogram</span>(dend)</span></code></pre></div>
<pre><code>## Loading required namespace: circlize</code></pre>
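<p><img role="img" alt="Circular dendrogram of the clustered Iris data set" /><!-- embedded image data is missing from this copy of the file --></p>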
<p>These visualizations clearly demonstrate that the hierarchical
clustering separates the “Setosa” species very well, but mislabels many
“Versicolor” flowers as “Virginica”.</p>
<p>The hanging of the tree also helps to locate extreme observations.
For example, we can see that observation “virginica (107)” is not very
similar to the Versicolor species, but it is still placed among them. Also,
“Versicolor (71)” is located deep “within” the group of Virginica
flowers.</p>
<p>We can also explore the data using a heatmap. The rows are ordered
based on the order of the hierarchical clustering (using the “complete”
method). The colored bar indicates the species category each row belongs
to. The color in the heatmap indicates the length of each measurement
(from light yellow to dark red).</p>
<p>In the heatmap we also see how the Setosa species has low petal
values (in light yellow), but it is very difficult to see any clear
distinction between the other two species.</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a>some_col_func <span class="ot">&lt;-</span> <span class="cf">function</span>(n) <span class="fu">rev</span>(colorspace<span class="sc">::</span><span class="fu">heat_hcl</span>(n, <span class="at">c =</span> <span class="fu">c</span>(<span class="dv">80</span>, <span class="dv">30</span>), <span class="at">l =</span> <span class="fu">c</span>(<span class="dv">30</span>, <span class="dv">90</span>), <span class="at">power =</span> <span class="fu">c</span>(<span class="dv">1</span><span class="sc">/</span><span class="dv">5</span>, <span class="fl">1.5</span>)))</span>
<span id="cb9-2"><a href="#cb9-2" tabindex="-1"></a></span>
<span id="cb9-3"><a href="#cb9-3" tabindex="-1"></a><span class="co"># scaled_iris2 &lt;- iris2 %&gt;% as.matrix %&gt;% scale</span></span>
<span id="cb9-4"><a href="#cb9-4" tabindex="-1"></a><span class="co"># library(gplots)</span></span>
<span id="cb9-5"><a href="#cb9-5" tabindex="-1"></a>gplots<span class="sc">::</span><span class="fu">heatmap.2</span>(<span class="fu">as.matrix</span>(iris2), </span>
<span id="cb9-6"><a href="#cb9-6" tabindex="-1"></a> <span class="at">main =</span> <span class="st">&quot;Heatmap for the Iris data set&quot;</span>,</span>
<span id="cb9-7"><a href="#cb9-7" tabindex="-1"></a> <span class="at">srtCol =</span> <span class="dv">20</span>,</span>
<span id="cb9-8"><a href="#cb9-8" tabindex="-1"></a> <span class="at">dendrogram =</span> <span class="st">&quot;row&quot;</span>,</span>
<span id="cb9-9"><a href="#cb9-9" tabindex="-1"></a> <span class="at">Rowv =</span> dend,</span>
<span id="cb9-10"><a href="#cb9-10" tabindex="-1"></a> <span class="at">Colv =</span> <span class="st">&quot;NA&quot;</span>, <span class="co"># this to make sure the columns are not ordered</span></span>
<span id="cb9-11"><a href="#cb9-11" tabindex="-1"></a> <span class="at">trace=</span><span class="st">&quot;none&quot;</span>, </span>
<span id="cb9-12"><a href="#cb9-12" tabindex="-1"></a> <span class="at">margins =</span><span class="fu">c</span>(<span class="dv">5</span>,<span class="fl">0.1</span>), </span>
<span id="cb9-13"><a href="#cb9-13" tabindex="-1"></a> <span class="at">key.xlab =</span> <span class="st">&quot;Cm&quot;</span>,</span>
<span id="cb9-14"><a href="#cb9-14" tabindex="-1"></a> <span class="at">denscol =</span> <span class="st">&quot;grey&quot;</span>,</span>
<span id="cb9-15"><a href="#cb9-15" tabindex="-1"></a> <span class="at">density.info =</span> <span class="st">&quot;density&quot;</span>,</span>
<span id="cb9-16"><a href="#cb9-16" tabindex="-1"></a> <span class="at">RowSideColors =</span> <span class="fu">rev</span>(<span class="fu">labels_colors</span>(dend)), <span class="co"># to add nice colored strips </span></span>
<span id="cb9-17"><a href="#cb9-17" tabindex="-1"></a> <span class="at">col =</span> some_col_func</span>
<span id="cb9-18"><a href="#cb9-18" tabindex="-1"></a> )</span></code></pre></div>
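<p><img role="img" alt="Heatmap for the Iris data set, with rows ordered by the hierarchical clustering and a colored strip marking each row's species" /><!-- embedded image data is missing from this copy of the file --></p>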
<p>We can get an interactive heatmap by using the <code>heatmaply</code>
package/function (the code is not evaluated here, in order to keep the
HTML file small):</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a>heatmaply<span class="sc">::</span><span class="fu">heatmaply</span>(<span class="fu">as.matrix</span>(iris2),</span>
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a> <span class="at">dendrogram =</span> <span class="st">&quot;row&quot;</span>,</span>
<span id="cb10-3"><a href="#cb10-3" tabindex="-1"></a> <span class="at">Rowv =</span> dend)</span></code></pre></div>
</div>
<div id="similaritydifference-between-various-clustering-algorithms" class="section level3">
<h3>Similarity/difference between various clustering algorithms</h3>
<p>We may ask ourselves how many different results we would get if we
used different clustering algorithms (<code>hclust</code> has 8
different algorithms implemented). For the purpose of this analysis, we
will create all 8 hclust objects, and chain them together into a single
<code>dendlist</code> object (which, as the name implies, can hold a
bunch of dendrograms together for the purpose of further analysis).</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a>hclust_methods <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;ward.D&quot;</span>, <span class="st">&quot;single&quot;</span>, <span class="st">&quot;complete&quot;</span>, <span class="st">&quot;average&quot;</span>, <span class="st">&quot;mcquitty&quot;</span>, </span>
<span id="cb11-2"><a href="#cb11-2" tabindex="-1"></a> <span class="st">&quot;median&quot;</span>, <span class="st">&quot;centroid&quot;</span>, <span class="st">&quot;ward.D2&quot;</span>)</span>
<span id="cb11-3"><a href="#cb11-3" tabindex="-1"></a>iris_dendlist <span class="ot">&lt;-</span> <span class="fu">dendlist</span>()</span>
<span id="cb11-4"><a href="#cb11-4" tabindex="-1"></a><span class="cf">for</span>(i <span class="cf">in</span> <span class="fu">seq_along</span>(hclust_methods)) {</span>
<span id="cb11-5"><a href="#cb11-5" tabindex="-1"></a> hc_iris <span class="ot">&lt;-</span> <span class="fu">hclust</span>(d_iris, <span class="at">method =</span> hclust_methods[i]) </span>
<span id="cb11-6"><a href="#cb11-6" tabindex="-1"></a> iris_dendlist <span class="ot">&lt;-</span> <span class="fu">dendlist</span>(iris_dendlist, <span class="fu">as.dendrogram</span>(hc_iris))</span>
<span id="cb11-7"><a href="#cb11-7" tabindex="-1"></a>}</span>
<span id="cb11-8"><a href="#cb11-8" tabindex="-1"></a><span class="fu">names</span>(iris_dendlist) <span class="ot">&lt;-</span> hclust_methods</span>
<span id="cb11-9"><a href="#cb11-9" tabindex="-1"></a>iris_dendlist</span></code></pre></div>
<pre><code>## $ward.D
## &#39;dendrogram&#39; with 2 branches and 150 members total, at height 199.6205
##
## $single
## &#39;dendrogram&#39; with 2 branches and 150 members total, at height 1.640122
##
## $complete
## &#39;dendrogram&#39; with 2 branches and 150 members total, at height 7.085196
##
## $average
## &#39;dendrogram&#39; with 2 branches and 150 members total, at height 4.062683
##
## $mcquitty
## &#39;dendrogram&#39; with 2 branches and 150 members total, at height 4.497283
##
## $median
## &#39;dendrogram&#39; with 2 branches and 150 members total, at height 2.82744
##
## $centroid
## &#39;dendrogram&#39; with 2 branches and 150 members total, at height 2.994307
##
## $ward.D2
## &#39;dendrogram&#39; with 2 branches and 150 members total, at height 32.44761
##
## attr(,&quot;class&quot;)
## [1] &quot;dendlist&quot;</code></pre>
<p>Next, we can look at the cophenetic correlation between each
clustering result using <code>cor.dendlist</code>. (This can be nicely
plotted using the <code>corrplot</code> function from the
<em>corrplot</em> package):</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" tabindex="-1"></a>iris_dendlist_cor <span class="ot">&lt;-</span> <span class="fu">cor.dendlist</span>(iris_dendlist)</span>
<span id="cb13-2"><a href="#cb13-2" tabindex="-1"></a>iris_dendlist_cor</span></code></pre></div>
<pre><code>## ward.D single complete average mcquitty median centroid
## ward.D 1.0000000 0.9836838 0.5774013 0.9841333 0.9641103 0.9451815 0.9809088
## single 0.9836838 1.0000000 0.5665529 0.9681156 0.9329029 0.9444723 0.9903934
## complete 0.5774013 0.5665529 1.0000000 0.6195121 0.6107473 0.6889092 0.5870062
## average 0.9841333 0.9681156 0.6195121 1.0000000 0.9828015 0.9449422 0.9801444
## mcquitty 0.9641103 0.9329029 0.6107473 0.9828015 1.0000000 0.9203374 0.9499123
## median 0.9451815 0.9444723 0.6889092 0.9449422 0.9203374 1.0000000 0.9403569
## centroid 0.9809088 0.9903934 0.5870062 0.9801444 0.9499123 0.9403569 1.0000000
## ward.D2 0.9911648 0.9682507 0.6096286 0.9895131 0.9829977 0.9445832 0.9737886
## ward.D2
## ward.D 0.9911648
## single 0.9682507
## complete 0.6096286
## average 0.9895131
## mcquitty 0.9829977
## median 0.9445832
## centroid 0.9737886
## ward.D2 1.0000000</code></pre>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(iris_dendlist_cor, <span class="st">&quot;pie&quot;</span>, <span class="st">&quot;lower&quot;</span>)</span></code></pre></div>
<p><img role="img" src="
<p>From the above figure, we can easily see that most clustering methods
yield very similar results, except for the complete method (the default
method in <code>hclust</code>), which yields a correlation measure of
around 0.6.</p>
<p>The default cophenetic correlation uses Pearson’s measure, but what
if we use Spearman’s correlation coefficient?</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a>iris_dendlist_cor_spearman <span class="ot">&lt;-</span> <span class="fu">cor.dendlist</span>(iris_dendlist, <span class="at">method_coef =</span> <span class="st">&quot;spearman&quot;</span>)</span>
<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(iris_dendlist_cor_spearman, <span class="st">&quot;pie&quot;</span>, <span class="st">&quot;lower&quot;</span>)</span></code></pre></div>
<p><img role="img" src="
<p>We can see that the correlations are not as strong, indicating a
behavior that is dependent on some items which are very distant from one
another having more influence on Pearson’s correlation than on
Spearman’s correlation.</p>
<p>To further explore the similarity and difference between the
alternative clustering algorithms, we can turn to using the
<code>tanglegram</code> function (which works for either two
<code>dendrogram</code>s, or a <code>dendlist</code>).</p>
<p>First, let us see two methods which are very similar: ward.D vs
ward.D2. At first glance, we can see how they both give the same
result for the top 3 clusters. However, since they are both ladderized
(i.e.: having their smaller branch rotated to be higher for each node),
we can see that their clustering is not identical (due to the
crossings).</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" tabindex="-1"></a><span class="co"># The `which` parameter allows us to pick the elements in the list to compare</span></span>
<span id="cb17-2"><a href="#cb17-2" tabindex="-1"></a>iris_dendlist <span class="sc">%&gt;%</span> <span class="fu">dendlist</span>(<span class="at">which =</span> <span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">8</span>)) <span class="sc">%&gt;%</span> ladderize <span class="sc">%&gt;%</span> </span>
<span id="cb17-3"><a href="#cb17-3" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">&quot;branches_k_color&quot;</span>, <span class="at">k=</span><span class="dv">3</span>) <span class="sc">%&gt;%</span> </span>
<span id="cb17-4"><a href="#cb17-4" tabindex="-1"></a> <span class="co"># untangle(method = &quot;step1side&quot;, k_seq = 3:20) %&gt;%</span></span>
<span id="cb17-5"><a href="#cb17-5" tabindex="-1"></a> <span class="co"># set(&quot;clear_branches&quot;) %&gt;% #otherwise the single lines are not black, since they retain the previous color from the branches_k_color.</span></span>
<span id="cb17-6"><a href="#cb17-6" tabindex="-1"></a> <span class="fu">tanglegram</span>(<span class="at">faster =</span> <span class="cn">TRUE</span>) <span class="co"># (common_subtrees_color_branches = TRUE)</span></span></code></pre></div>
<p><img role="img" src="
<p>Next, let us look at two methods which also have a high cophenetic
correlation: ward.D vs the average:</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" tabindex="-1"></a><span class="co"># The `which` parameter allows us to pick the elements in the list to compare</span></span>
<span id="cb18-2"><a href="#cb18-2" tabindex="-1"></a>iris_dendlist <span class="sc">%&gt;%</span> <span class="fu">dendlist</span>(<span class="at">which =</span> <span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">4</span>)) <span class="sc">%&gt;%</span> ladderize <span class="sc">%&gt;%</span> </span>
<span id="cb18-3"><a href="#cb18-3" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">&quot;branches_k_color&quot;</span>, <span class="at">k=</span><span class="dv">2</span>) <span class="sc">%&gt;%</span> </span>
<span id="cb18-4"><a href="#cb18-4" tabindex="-1"></a> <span class="co"># untangle(method = &quot;step1side&quot;, k_seq = 3:20) %&gt;%</span></span>
<span id="cb18-5"><a href="#cb18-5" tabindex="-1"></a> <span class="fu">tanglegram</span>(<span class="at">faster =</span> <span class="cn">TRUE</span>) <span class="co"># (common_subtrees_color_branches = TRUE)</span></span></code></pre></div>
<p><img role="img" src="
<p>We see that when it comes to the major clusters, the two algorithms
perform quite similarly.</p>
<p>However, how are they doing inside each of the clusters? It is quite
difficult to compare the two because of the high value in ward.D. For
comparison purposes, we can “rank” the heights of the branches in the
two dendrograms (while still preserving their internal order). Next, we
can highlight the shared common sub-trees (with different colors), and
the distinct edges (with a dashed line):</p>
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" tabindex="-1"></a><span class="co"># The `which` parameter allows us to pick the elements in the list to compare</span></span>
<span id="cb19-2"><a href="#cb19-2" tabindex="-1"></a>iris_dendlist <span class="sc">%&gt;%</span> <span class="fu">dendlist</span>(<span class="at">which =</span> <span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">4</span>)) <span class="sc">%&gt;%</span> ladderize <span class="sc">%&gt;%</span> </span>
<span id="cb19-3"><a href="#cb19-3" tabindex="-1"></a> <span class="co"># untangle(method = &quot;step1side&quot;, k_seq = 3:20) %&gt;%</span></span>
<span id="cb19-4"><a href="#cb19-4" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">&quot;rank_branches&quot;</span>) <span class="sc">%&gt;%</span></span>
<span id="cb19-5"><a href="#cb19-5" tabindex="-1"></a> <span class="fu">tanglegram</span>(<span class="at">common_subtrees_color_branches =</span> <span class="cn">TRUE</span>)</span></code></pre></div>
<p><img role="img" src="
<p>We have 39 sub-trees that are identical between the two
dendrograms:</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a><span class="fu">length</span>(<span class="fu">unique</span>(<span class="fu">common_subtrees_clusters</span>(iris_dendlist[[<span class="dv">1</span>]], iris_dendlist[[<span class="dv">4</span>]]))[<span class="sc">-</span><span class="dv">1</span>])</span></code></pre></div>
<pre><code>## [1] 39</code></pre>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" tabindex="-1"></a><span class="co"># -1 at the end is because we are ignoring the &quot;0&quot; subtree, which indicates leaves that are singletons.</span></span></code></pre></div>
<p>What we can learn from this is that actually the two algorithms seem
to give quite different results in the high resolution (higher cuts).
However, since both capture the two major clusters (Setosa vs the
others), they are considered quite similar by the cophenetic
correlation.</p>
<p>But what about the “complete” method (that got a lower cophenetic
correlation than the other methods)? When we compare “complete” vs
“average”, we can quickly see that in the “complete” method, the
splitting of the clusters is much more balanced, and mixes the “Setosa”
species with another one. This is probably the cause for the big
difference found in the cophenetic correlation between the “complete
method” and the other clustering methods:</p>
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" tabindex="-1"></a>iris_dendlist <span class="sc">%&gt;%</span> <span class="fu">dendlist</span>(<span class="at">which =</span> <span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">4</span>)) <span class="sc">%&gt;%</span> ladderize <span class="sc">%&gt;%</span> </span>
<span id="cb23-2"><a href="#cb23-2" tabindex="-1"></a> <span class="fu">untangle</span>(<span class="at">method =</span> <span class="st">&quot;step1side&quot;</span>, <span class="at">k_seq =</span> <span class="dv">2</span><span class="sc">:</span><span class="dv">6</span>) <span class="sc">%&gt;%</span></span>
<span id="cb23-3"><a href="#cb23-3" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">&quot;branches_k_color&quot;</span>, <span class="at">k=</span><span class="dv">2</span>) <span class="sc">%&gt;%</span> </span>
<span id="cb23-4"><a href="#cb23-4" tabindex="-1"></a> <span class="fu">tanglegram</span>(<span class="at">faster =</span> <span class="cn">TRUE</span>) <span class="co"># (common_subtrees_color_branches = TRUE)</span></span></code></pre></div>
<p><img role="img" src="
<p>We can quickly plot all 8 methods to see this phenomenon (i.e.: that
“complete” has its smaller cluster larger than it is in all the other
clustering methods):</p>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow =</span> <span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>))</span>
<span id="cb24-2"><a href="#cb24-2" tabindex="-1"></a><span class="cf">for</span>(i <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span><span class="dv">8</span>) {</span>
<span id="cb24-3"><a href="#cb24-3" tabindex="-1"></a> iris_dendlist[[i]] <span class="sc">%&gt;%</span> <span class="fu">set</span>(<span class="st">&quot;branches_k_color&quot;</span>, <span class="at">k=</span><span class="dv">2</span>) <span class="sc">%&gt;%</span> <span class="fu">plot</span>(<span class="at">axes =</span> <span class="cn">FALSE</span>, <span class="at">horiz =</span> <span class="cn">TRUE</span>)</span>
<span id="cb24-4"><a href="#cb24-4" tabindex="-1"></a> <span class="fu">title</span>(<span class="fu">names</span>(iris_dendlist)[i])</span>
<span id="cb24-5"><a href="#cb24-5" tabindex="-1"></a>}</span></code></pre></div>
<p><img role="img" src="
<p>It seems that the cophenetic correlation is very biased towards the
influence of the main clusters. Another correlation measure to use is
the <code>cor_common_nodes</code> correlation (giving the proportion of
nodes which share the exact same list of labels in both dendrograms). We
can also check it out:</p>
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" tabindex="-1"></a>iris_dendlist_cor2 <span class="ot">&lt;-</span> <span class="fu">cor.dendlist</span>(iris_dendlist, <span class="at">method =</span> <span class="st">&quot;common&quot;</span>)</span>
<span id="cb25-2"><a href="#cb25-2" tabindex="-1"></a>iris_dendlist_cor2</span></code></pre></div>
<pre><code>## ward.D single complete average mcquitty median centroid
## ward.D 1.0000000 0.7324415 0.8595318 0.8461538 0.8361204 0.7458194 0.7324415
## single 0.7324415 1.0000000 0.7324415 0.7491639 0.7458194 0.7591973 0.7625418
## complete 0.8595318 0.7324415 1.0000000 0.8060201 0.7993311 0.7491639 0.7290970
## average 0.8461538 0.7491639 0.8060201 1.0000000 0.8494983 0.7892977 0.7725753
## mcquitty 0.8361204 0.7458194 0.7993311 0.8494983 1.0000000 0.7859532 0.7759197
## median 0.7458194 0.7591973 0.7491639 0.7892977 0.7859532 1.0000000 0.8528428
## centroid 0.7324415 0.7625418 0.7290970 0.7725753 0.7759197 0.8528428 1.0000000
## ward.D2 0.8795987 0.7324415 0.8294314 0.8294314 0.8294314 0.7558528 0.7357860
## ward.D2
## ward.D 0.8795987
## single 0.7324415
## complete 0.8294314
## average 0.8294314
## mcquitty 0.8294314
## median 0.7558528
## centroid 0.7357860
## ward.D2 1.0000000</code></pre>
<p>And plot it:</p>
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" tabindex="-1"></a><span class="co"># corrplot::corrplot(iris_dendlist_cor2, &quot;pie&quot;, &quot;lower&quot;)</span></span></code></pre></div>
<p>This gives us another perspective on our clustering algorithms. We
can see that most methods have around 75% common nodes with one another.
Centroid and median seem relatively close to one another, as well as
ward.D2 and ward.D to one another and to complete, average, and mcquitty
(as compared to the other methods).</p>
</div>
<div id="clustering-prediction-of-the-3-species-classes" class="section level3">
<h3>Clustering prediction of the 3 species classes</h3>
<p>Lastly, we would like to see which of the different clustering
algorithms came the closest to detecting the 3 flower species (when
using a cut of k=3).</p>
<p>For this purpose, we compare the clustering solution of each
algorithm with the real clusters, using the Fowlkes-Mallows Index (also
used in the package for the <code>Bk_plot</code>). This measure is
similar to the Rand (or adjusted Rand) index, and gives a value of 1 when
the two clusterings agree, and 0 when they do not.</p>
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" tabindex="-1"></a>get_ordered_3_clusters <span class="ot">&lt;-</span> <span class="cf">function</span>(dend) {</span>
<span id="cb28-2"><a href="#cb28-2" tabindex="-1"></a> <span class="fu">cutree</span>(dend, <span class="at">k =</span> <span class="dv">3</span>)[<span class="fu">order.dendrogram</span>(dend)]</span>
<span id="cb28-3"><a href="#cb28-3" tabindex="-1"></a>}</span>
<span id="cb28-4"><a href="#cb28-4" tabindex="-1"></a></span>
<span id="cb28-5"><a href="#cb28-5" tabindex="-1"></a>dend_3_clusters <span class="ot">&lt;-</span> <span class="fu">lapply</span>(iris_dendlist, get_ordered_3_clusters)</span>
<span id="cb28-6"><a href="#cb28-6" tabindex="-1"></a></span>
<span id="cb28-7"><a href="#cb28-7" tabindex="-1"></a>compare_clusters_to_iris <span class="ot">&lt;-</span> <span class="cf">function</span>(clus) {<span class="fu">FM_index</span>(clus, <span class="fu">rep</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>, <span class="at">each =</span> <span class="dv">50</span>), <span class="at">assume_sorted_vectors =</span> <span class="cn">TRUE</span>)}</span>
<span id="cb28-8"><a href="#cb28-8" tabindex="-1"></a></span>
<span id="cb28-9"><a href="#cb28-9" tabindex="-1"></a>clusters_performance <span class="ot">&lt;-</span> <span class="fu">sapply</span>(dend_3_clusters, compare_clusters_to_iris)</span>
<span id="cb28-10"><a href="#cb28-10" tabindex="-1"></a><span class="fu">dotchart</span>(<span class="fu">sort</span>(clusters_performance), <span class="at">xlim =</span> <span class="fu">c</span>(<span class="fl">0.7</span>,<span class="dv">1</span>),</span>
<span id="cb28-11"><a href="#cb28-11" tabindex="-1"></a> <span class="at">xlab =</span> <span class="st">&quot;Fowlkes-Mallows Index (from 0 to 1)&quot;</span>,</span>
<span id="cb28-12"><a href="#cb28-12" tabindex="-1"></a> <span class="at">main =</span> <span class="st">&quot;Perormance of clustering algorithms </span><span class="sc">\n</span><span class="st"> in detecting the 3 species&quot;</span>,</span>
<span id="cb28-13"><a href="#cb28-13" tabindex="-1"></a> <span class="at">pch =</span> <span class="dv">19</span>)</span></code></pre></div>
<p><img role="img" src="
<p>We can see that the “median” method did the best, although similar
results were achieved by ward.D2, average, ward.D, and mcquitty.
However, the complete, centroid, and single methods did worse in our
case.</p>
</div>
<div id="conclusion" class="section level3">
<h3>Conclusion</h3>
<p>The Iris data set is only 4-dimensional, making it possible to
explore using pairs plot (SPLOM) or parallel coordinates plot. It is
clear from these that two main clusters are visible, while the
separation of the third cluster is difficult.</p>
<p>In the above analysis, we learned that the complete method fails to
do the proper separation of the two main clusters when cut in k=2 (but
succeeds in doing it, if moving to k=3 clusters). This is different from
all the other 7 methods available in <code>hclust</code>, which do
succeed in separating the 2 main clusters from the beginning (i.e.: for
k=2).</p>
<p>We also noticed that all clustering algorithms share a relatively
high proportion of common nodes (between 75% and 90%).</p>
<p>Lastly, when it came to trying to separate the flowers into 3
species, the median clustering method did the best, while the single
method did the worst in this regard.</p>
<p>While the Iris data set is well known, I hope the above analysis was
able to offer some new perspectives on the performance of the different
hierarchical clustering methods.</p>
</div>
</div>
<div id="khan---microarray-gene-expression-data-set-from-khan-et-al.-2001.-subset-of-306-genes." class="section level2">
<h2>khan - Microarray gene expression data set from Khan et al., 2001.
Subset of 306 genes.</h2>
<div id="background-1" class="section level3">
<h3>Background</h3>
<blockquote>
<p>Khan contains gene expression profiles of four types of small, round,
blue cell tumors of childhood (SRBCT) published by Khan et al. (2001).
It also contains further gene annotation retrieved from SOURCE at <a href="http://source.stanford.edu/" class="uri">http://source.stanford.edu/</a>.</p>
</blockquote>
<p>This interesting data set offers two interesting items:</p>
<ul>
<li>train: data.frame of 306 rows and 64 columns. The training data set
of 64 arrays and 306 gene expression values</li>
<li>test: data.frame of 306 rows and 25 columns. The test data set of
25 arrays and 306 gene expression values</li>
</ul>
<p>This way we can create a hierarchical clustering on the 306 genes
expression values on the train and the test data and compare the two to
see the stability of the results.</p>
<p>We define the variables:</p>
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" tabindex="-1"></a>train <span class="ot">&lt;-</span> dendextend<span class="sc">::</span>khan<span class="sc">$</span>train</span>
<span id="cb29-2"><a href="#cb29-2" tabindex="-1"></a>test <span class="ot">&lt;-</span> dendextend<span class="sc">::</span>khan<span class="sc">$</span>test</span></code></pre></div>
<p>And create the dendrograms:</p>
<div class="sourceCode" id="cb30"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" tabindex="-1"></a>d_train <span class="ot">&lt;-</span> train <span class="sc">%&gt;%</span> dist <span class="sc">%&gt;%</span> hclust <span class="sc">%&gt;%</span> as.dendrogram</span>
<span id="cb30-2"><a href="#cb30-2" tabindex="-1"></a>d_test <span class="ot">&lt;-</span> test <span class="sc">%&gt;%</span> dist <span class="sc">%&gt;%</span> hclust <span class="sc">%&gt;%</span> as.dendrogram</span>
<span id="cb30-3"><a href="#cb30-3" tabindex="-1"></a>d_train_test <span class="ot">&lt;-</span> <span class="fu">dendlist</span>(<span class="at">train =</span> d_train, <span class="at">test =</span> d_test)</span></code></pre></div>
</div>
<div id="comparing-the-train-vs-test-dendrograms" class="section level3">
<h3>Comparing the train vs test dendrograms</h3>
<p>Using a cophenetic correlation, we can see the two trees have some
similarity (0.57):</p>
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" tabindex="-1"></a>d_train_test <span class="sc">%&gt;%</span> cor.dendlist</span></code></pre></div>
<pre><code>## train test
## train 1.0000000 0.5708019
## test 0.5708019 1.0000000</code></pre>
<p>However, when looking at the cophenetic correlation with the Spearman
correlation coefficient, the value is lower (0.50), indicating that some
of the similarity is due to a small number of items, distant from the
others, which are correlated similarly in the two trees:</p>
<div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" tabindex="-1"></a>d_train_test <span class="sc">%&gt;%</span> <span class="fu">cor.dendlist</span>(<span class="at">method_coef =</span> <span class="st">&quot;spearman&quot;</span>)</span></code></pre></div>
<pre><code>## train test
## train 1.0000000 0.4971936
## test 0.4971936 1.0000000</code></pre>
<p>We may ask at which level of cutting the dendrogram we get the “best”
level of similarity. For this we may turn to the Bk plot. The plot
shows us that at around 7 clusters the groups in the two are starting to
look significantly similar. (Note that significantly does not mean
substantially.)</p>
<div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" tabindex="-1"></a><span class="fu">Bk_plot</span>(d_train, d_test, <span class="at">k =</span> <span class="dv">2</span><span class="sc">:</span><span class="dv">30</span>, <span class="at">xlim =</span> <span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">30</span>))</span></code></pre></div>
<p><img role="img" src="
<p>Next, we compare the results with a tanglegram. We make sure to color
the connecting line with the colors of the branches of the train (left)
dendrogram. This can help us see which patterns are somewhat preserved
between the two trees.</p>
<div class="sourceCode" id="cb36"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" tabindex="-1"></a>pre_tang_d_train_test <span class="ot">&lt;-</span> d_train_test <span class="sc">%&gt;%</span> ladderize <span class="sc">%&gt;%</span> <span class="co"># untangle %&gt;%</span></span>
<span id="cb36-2"><a href="#cb36-2" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">&quot;branches_k_color&quot;</span>, <span class="at">k =</span> <span class="dv">7</span>)</span>
<span id="cb36-3"><a href="#cb36-3" tabindex="-1"></a>train_branches_colors <span class="ot">&lt;-</span> <span class="fu">get_leaves_branches_col</span>(pre_tang_d_train_test<span class="sc">$</span>train)</span>
<span id="cb36-4"><a href="#cb36-4" tabindex="-1"></a>pre_tang_d_train_test <span class="sc">%&gt;%</span> <span class="fu">tanglegram</span>(<span class="at">fast =</span> <span class="cn">TRUE</span>, <span class="at">color_lines =</span> train_branches_colors)</span></code></pre></div>
<p><img role="img" src="
<p>We can see that the top most (small) cluster is somewhat preserved
between the two trees. However, a large spaghetti-like tangle of lines
is indicating that the two trees are far from being identical.</p>
<p>If we look only at subtrees of the two dendrograms so that they
include only genes that are clustered with genes in both trees, we get
only 14 genes (while the original trees had 306 genes). We can see how
we have several groups of pairs of genes, and one group with four genes
clustered together in both trees:</p>
<div class="sourceCode" id="cb37"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1" tabindex="-1"></a><span class="co"># This was calculated before</span></span>
<span id="cb37-2"><a href="#cb37-2" tabindex="-1"></a><span class="co"># d_train_test_common &lt;- d_train_test %&gt;% prune_common_subtrees.dendlist</span></span>
<span id="cb37-3"><a href="#cb37-3" tabindex="-1"></a><span class="co"># d_train_test_common</span></span>
<span id="cb37-4"><a href="#cb37-4" tabindex="-1"></a>d_train_test_common <span class="sc">%&gt;%</span> untangle <span class="sc">%&gt;%</span> <span class="fu">tanglegram</span>(<span class="at">common_subtrees_color_branches =</span> <span class="cn">TRUE</span>)</span></code></pre></div>
<p><img role="img" src="
<p>Trees sizes:</p>
<div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" tabindex="-1"></a>d_train_test <span class="sc">%&gt;%</span> nleaves</span></code></pre></div>
<pre><code>## train test
## 306 306</code></pre>
<div class="sourceCode" id="cb40"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1" tabindex="-1"></a>d_train_test_common <span class="sc">%&gt;%</span> nleaves</span></code></pre></div>
<pre><code>## train test
## 14 14</code></pre>
</div>
<div id="conclusion-1" class="section level3">
<h3>Conclusion</h3>
<p>To conclude: we see that the clustering algorithm resulted in trees
which are significantly similar in both the training and the test data
sets beyond chance, but that this similarity is restricted to only a
very small proportion of genes.</p>
</div>
</div>
<div id="votes.repub---votes-for-republican-candidate-in-presidential-elections" class="section level2">
<h2>votes.repub - Votes for Republican Candidate in Presidential
Elections</h2>
<div id="background-2" class="section level3">
<h3>Background</h3>
<blockquote>
<p>This is a data frame with the percentage of votes given to the
republican candidate in presidential elections from 1856 to 1976. Rows
represent the 50 states, and columns the 31 elections.</p>
</blockquote>
<blockquote>
<p>Source: S. Peterson (1973): A Statistical History of the American
Presidential Elections. New York: Frederick Ungar Publishing Co. Data
from 1964 to 1976 is from R. M. Scammon, American Votes 12,
Congressional Quarterly.</p>
</blockquote>
<p>Define variables:</p>
<div class="sourceCode" id="cb42"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1" tabindex="-1"></a>votes.repub <span class="ot">&lt;-</span> cluster<span class="sc">::</span>votes.repub</span></code></pre></div>
<p>These data can be visualized using a (custom-made) parallel
coordinates plot:</p>
<div class="sourceCode" id="cb43"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb43-1"><a href="#cb43-1" tabindex="-1"></a>years <span class="ot">&lt;-</span> <span class="fu">as.numeric</span>(<span class="fu">gsub</span>(<span class="st">&quot;X&quot;</span>, <span class="st">&quot;&quot;</span>, <span class="fu">colnames</span>(votes.repub)))</span>
<span id="cb43-2"><a href="#cb43-2" tabindex="-1"></a></span>
<span id="cb43-3"><a href="#cb43-3" tabindex="-1"></a><span class="fu">par</span>(<span class="at">las =</span> <span class="dv">2</span>, <span class="at">mar =</span> <span class="fu">c</span>(<span class="fl">4.5</span>, <span class="dv">3</span>, <span class="dv">3</span>, <span class="dv">2</span>) <span class="sc">+</span> <span class="fl">0.1</span>, <span class="at">cex =</span> .<span class="dv">8</span>)</span>
<span id="cb43-4"><a href="#cb43-4" tabindex="-1"></a><span class="co"># MASS::parcoord(votes.repub, var.label = FALSE, lwd = 1)</span></span>
<span id="cb43-5"><a href="#cb43-5" tabindex="-1"></a><span class="fu">matplot</span>(<span class="dv">1</span><span class="dt">L</span><span class="sc">:</span><span class="fu">ncol</span>(votes.repub), <span class="fu">t</span>(votes.repub), <span class="at">type =</span> <span class="st">&quot;l&quot;</span>, <span class="at">col =</span> <span class="dv">1</span>, <span class="at">lty =</span> <span class="dv">1</span>,</span>
<span id="cb43-6"><a href="#cb43-6" tabindex="-1"></a> <span class="at">axes =</span> F, <span class="at">xlab =</span> <span class="st">&quot;&quot;</span>, <span class="at">ylab =</span> <span class="st">&quot;&quot;</span>)</span>
<span id="cb43-7"><a href="#cb43-7" tabindex="-1"></a><span class="fu">axis</span>(<span class="dv">1</span>, <span class="at">at =</span> <span class="fu">seq_along</span>(years), <span class="at">labels =</span> years)</span>
<span id="cb43-8"><a href="#cb43-8" tabindex="-1"></a><span class="fu">axis</span>(<span class="dv">2</span>)</span>
<span id="cb43-9"><a href="#cb43-9" tabindex="-1"></a><span class="co"># Add Title</span></span>
<span id="cb43-10"><a href="#cb43-10" tabindex="-1"></a><span class="fu">title</span>(<span class="st">&quot;Votes for Republican Candidate</span><span class="sc">\n</span><span class="st"> in Presidential Elections </span><span class="sc">\n</span><span class="st"> (each line is a country - over the years)&quot;</span>)</span></code></pre></div>
<p><img role="img" src="
</div>
<div id="heatmap" class="section level3">
<h3>Heatmap</h3>
<p>This is a nice example of a case where the parallel coordinates plot
has some serious limitations: it does not help us identify the states, we
fail to see the missing value patterns, and it is tricky to see clusters
in general (due to the large number of lines).</p>
<p>For these data, it can be quite helpful to see a heatmap of the votes
across the years. The ordering of the rows is tricky. First, the
distance of the vectors (later used for the clustering) should be done
after transformation (since we are dealing with proportion of votes). In
this case, I used the arcsin transformation (a logit transformation
could also work, but the arcsin is safer for dealing with 0/1
observations). But given the clusters, we wish to order the leaves (as
much as possible), in order to take into account the missing value
clusterings. So we, in fact, have two clusterings, one for the raw values,
and another for the “shadow matrix” (i.e.: the matrix with 0/1,
indicating if a value was missing or not).</p>
<div class="sourceCode" id="cb44"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb44-1"><a href="#cb44-1" tabindex="-1"></a>arcsin_transformation <span class="ot">&lt;-</span> <span class="cf">function</span>(x) <span class="fu">asin</span>(x<span class="sc">/</span><span class="dv">100</span>)</span>
<span id="cb44-2"><a href="#cb44-2" tabindex="-1"></a></span>
<span id="cb44-3"><a href="#cb44-3" tabindex="-1"></a>dend_NA <span class="ot">&lt;-</span> votes.repub <span class="sc">%&gt;%</span> is.na <span class="sc">%&gt;%</span></span>
<span id="cb44-4"><a href="#cb44-4" tabindex="-1"></a> dist <span class="sc">%&gt;%</span> hclust <span class="sc">%&gt;%</span> as.dendrogram <span class="sc">%&gt;%</span> ladderize</span>
<span id="cb44-5"><a href="#cb44-5" tabindex="-1"></a></span>
<span id="cb44-6"><a href="#cb44-6" tabindex="-1"></a>dend <span class="ot">&lt;-</span> votes.repub <span class="sc">%&gt;%</span> arcsin_transformation <span class="sc">%&gt;%</span></span>
<span id="cb44-7"><a href="#cb44-7" tabindex="-1"></a> dist <span class="sc">%&gt;%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">&quot;com&quot;</span>) <span class="sc">%&gt;%</span> as.dendrogram <span class="sc">%&gt;%</span></span>
<span id="cb44-8"><a href="#cb44-8" tabindex="-1"></a> <span class="fu">rotate</span>(<span class="fu">labels</span>(dend_NA)) <span class="sc">%&gt;%</span></span>
<span id="cb44-9"><a href="#cb44-9" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">3</span>)</span>
<span id="cb44-10"><a href="#cb44-10" tabindex="-1"></a></span>
<span id="cb44-11"><a href="#cb44-11" tabindex="-1"></a><span class="co"># some_col_func &lt;- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))</span></span>
<span id="cb44-12"><a href="#cb44-12" tabindex="-1"></a>some_col_func <span class="ot">&lt;-</span> colorspace<span class="sc">::</span>diverge_hcl</span>
<span id="cb44-13"><a href="#cb44-13" tabindex="-1"></a></span>
<span id="cb44-14"><a href="#cb44-14" tabindex="-1"></a></span>
<span id="cb44-15"><a href="#cb44-15" tabindex="-1"></a><span class="co"># par(mar = c(3,3,3,3))</span></span>
<span id="cb44-16"><a href="#cb44-16" tabindex="-1"></a><span class="co"># library(gplots)</span></span>
<span id="cb44-17"><a href="#cb44-17" tabindex="-1"></a>gplots<span class="sc">::</span><span class="fu">heatmap.2</span>(<span class="fu">as.matrix</span>(votes.repub), </span>
<span id="cb44-18"><a href="#cb44-18" tabindex="-1"></a> <span class="at">main =</span> <span class="st">&quot;Votes for</span><span class="sc">\n</span><span class="st"> Republican Presidential Candidate</span><span class="sc">\n</span><span class="st"> (clustered using complete)&quot;</span>,</span>
<span id="cb44-19"><a href="#cb44-19" tabindex="-1"></a> <span class="at">srtCol =</span> <span class="dv">60</span>,</span>
<span id="cb44-20"><a href="#cb44-20" tabindex="-1"></a> <span class="at">dendrogram =</span> <span class="st">&quot;row&quot;</span>,</span>
<span id="cb44-21"><a href="#cb44-21" tabindex="-1"></a> <span class="at">Rowv =</span> dend,</span>
<span id="cb44-22"><a href="#cb44-22" tabindex="-1"></a> <span class="at">Colv =</span> <span class="st">&quot;NA&quot;</span>, <span class="co"># this to make sure the columns are not ordered</span></span>
<span id="cb44-23"><a href="#cb44-23" tabindex="-1"></a> <span class="at">trace=</span><span class="st">&quot;none&quot;</span>, </span>
<span id="cb44-24"><a href="#cb44-24" tabindex="-1"></a> <span class="at">margins =</span><span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">6</span>), </span>
<span id="cb44-25"><a href="#cb44-25" tabindex="-1"></a> <span class="at">key.xlab =</span> <span class="st">&quot;% Votes for Republican</span><span class="sc">\n</span><span class="st"> Presidential Candidate&quot;</span>,</span>
<span id="cb44-26"><a href="#cb44-26" tabindex="-1"></a> <span class="at">labCol =</span> years,</span>
<span id="cb44-27"><a href="#cb44-27" tabindex="-1"></a> <span class="at">denscol =</span> <span class="st">&quot;grey&quot;</span>,</span>
<span id="cb44-28"><a href="#cb44-28" tabindex="-1"></a> <span class="at">density.info =</span> <span class="st">&quot;density&quot;</span>,</span>
<span id="cb44-29"><a href="#cb44-29" tabindex="-1"></a> <span class="at">col =</span> some_col_func</span>
<span id="cb44-30"><a href="#cb44-30" tabindex="-1"></a> )</span></code></pre></div>
<p><img role="img" src="
<div class="sourceCode" id="cb45"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb45-1"><a href="#cb45-1" tabindex="-1"></a> <span class="co"># RowSideColors = rev(labels_colors(dend)), # to add nice colored strips </span></span></code></pre></div>
<p>How much of a difference would we get if we used another clustering
algorithm?</p>
<p>We first calculate the clustering using 8 different methods:</p>
<div class="sourceCode" id="cb46"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb46-1"><a href="#cb46-1" tabindex="-1"></a>hclust_methods <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;ward.D&quot;</span>, <span class="st">&quot;single&quot;</span>, <span class="st">&quot;complete&quot;</span>, <span class="st">&quot;average&quot;</span>, <span class="st">&quot;mcquitty&quot;</span>, </span>
<span id="cb46-2"><a href="#cb46-2" tabindex="-1"></a> <span class="st">&quot;median&quot;</span>, <span class="st">&quot;centroid&quot;</span>, <span class="st">&quot;ward.D2&quot;</span>)</span>
<span id="cb46-3"><a href="#cb46-3" tabindex="-1"></a>votes.repub_dendlist <span class="ot">&lt;-</span> <span class="fu">dendlist</span>()</span>
<span id="cb46-4"><a href="#cb46-4" tabindex="-1"></a></span>
<span id="cb46-5"><a href="#cb46-5" tabindex="-1"></a><span class="cf">for</span>(i <span class="cf">in</span> <span class="fu">seq_along</span>(hclust_methods)) {</span>
<span id="cb46-6"><a href="#cb46-6" tabindex="-1"></a> tmp_dend <span class="ot">&lt;-</span> votes.repub <span class="sc">%&gt;%</span> arcsin_transformation <span class="sc">%&gt;%</span> dist <span class="sc">%&gt;%</span> <span class="fu">hclust</span>(<span class="at">method =</span> hclust_methods[i]) <span class="sc">%&gt;%</span> as.dendrogram </span>
<span id="cb46-7"><a href="#cb46-7" tabindex="-1"></a> votes.repub_dendlist <span class="ot">&lt;-</span> <span class="fu">dendlist</span>(votes.repub_dendlist, tmp_dend)</span>
<span id="cb46-8"><a href="#cb46-8" tabindex="-1"></a>}</span>
<span id="cb46-9"><a href="#cb46-9" tabindex="-1"></a><span class="fu">names</span>(votes.repub_dendlist) <span class="ot">&lt;-</span> hclust_methods</span>
<span id="cb46-10"><a href="#cb46-10" tabindex="-1"></a><span class="co"># votes.repub_dendlist</span></span></code></pre></div>
<p>Next, we can look at the cophenetic correlation between each
clustering result using <code>cor.dendlist</code>. (This can be nicely
plotted using the <code>corrplot</code> function from the
<em>corrplot</em> package):</p>
<div class="sourceCode" id="cb47"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb47-1"><a href="#cb47-1" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(<span class="fu">cor.dendlist</span>(votes.repub_dendlist), <span class="st">&quot;pie&quot;</span>, <span class="st">&quot;lower&quot;</span>)</span></code></pre></div>
<p><img role="img" src="
<p>We see that the “complete” method is somewhat similar to the
ward.D/ward.D2 methods, but there is less similarity with the other
methods. We can see that the methods “average”, “mcquitty” and “median”,
all give somewhat similar results. So by using “average”, we will see an
alternative presentation that represents (in a sense) three other
clustering solutions.</p>
<p>We can look at the heatmap of the “average” method. However, as you
can see, it is not very helpful in seeing the difference between the two
clustering solutions.</p>
<p><img role="img" src="
<p>Let’s look at the tanglegram of the two methods to get a better
insight into the differences between the two:</p>
<div class="sourceCode" id="cb48"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1" tabindex="-1"></a>dend_com <span class="ot">&lt;-</span> votes.repub <span class="sc">%&gt;%</span> arcsin_transformation <span class="sc">%&gt;%</span></span>
<span id="cb48-2"><a href="#cb48-2" tabindex="-1"></a> dist <span class="sc">%&gt;%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">&quot;com&quot;</span>) <span class="sc">%&gt;%</span> as.dendrogram <span class="sc">%&gt;%</span></span>
<span id="cb48-3"><a href="#cb48-3" tabindex="-1"></a> <span class="fu">rotate</span>(<span class="fu">labels</span>(dend_NA)) <span class="sc">%&gt;%</span></span>
<span id="cb48-4"><a href="#cb48-4" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">3</span>) <span class="co"># %&gt;% ladderize</span></span>
<span id="cb48-5"><a href="#cb48-5" tabindex="-1"></a>dend_ave <span class="ot">&lt;-</span> votes.repub <span class="sc">%&gt;%</span> arcsin_transformation <span class="sc">%&gt;%</span></span>
<span id="cb48-6"><a href="#cb48-6" tabindex="-1"></a> dist <span class="sc">%&gt;%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">&quot;ave&quot;</span>) <span class="sc">%&gt;%</span> as.dendrogram <span class="sc">%&gt;%</span></span>
<span id="cb48-7"><a href="#cb48-7" tabindex="-1"></a> <span class="fu">rotate</span>(<span class="fu">labels</span>(dend_NA)) <span class="sc">%&gt;%</span></span>
<span id="cb48-8"><a href="#cb48-8" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">3</span>) <span class="co"># %&gt;% ladderize</span></span>
<span id="cb48-9"><a href="#cb48-9" tabindex="-1"></a></span>
<span id="cb48-10"><a href="#cb48-10" tabindex="-1"></a><span class="co"># The orders were predefined after using untangle(&quot;step2side&quot;)</span></span>
<span id="cb48-11"><a href="#cb48-11" tabindex="-1"></a><span class="co"># They are omitted here to save running time.</span></span>
<span id="cb48-12"><a href="#cb48-12" tabindex="-1"></a>dend_com <span class="ot">&lt;-</span> <span class="fu">rotate</span>(dend_com, ord1)</span>
<span id="cb48-13"><a href="#cb48-13" tabindex="-1"></a>dend_ave <span class="ot">&lt;-</span> <span class="fu">rotate</span>(dend_ave, ord2)</span>
<span id="cb48-14"><a href="#cb48-14" tabindex="-1"></a></span>
<span id="cb48-15"><a href="#cb48-15" tabindex="-1"></a>dends <span class="ot">&lt;-</span> <span class="fu">dendlist</span>(<span class="at">complete =</span> dend_com, <span class="at">average =</span> dend_ave) <span class="co"># %&gt;% untangle(&quot;step2side&quot;)</span></span>
<span id="cb48-16"><a href="#cb48-16" tabindex="-1"></a>dends <span class="sc">%&gt;%</span> <span class="fu">tanglegram</span>(<span class="at">margin_inner =</span> <span class="dv">7</span>)</span></code></pre></div>
<p><img role="img" src="
<p>We see that the two clusterings give similar results for: “Alabama”,
“Georgia”, “Louisiana”, “Arkansas”, “Florida”, “Texas”, “South
Carolina”, “Mississippi”.</p>
<p>There are also several other sub-trees which are identical between
the two methods. The biggest difference lies in several “rogue” states
that are placed differently in the two clustering algorithms. They are:
Vermont, Michigan, Maine, Hawaii, New Jersey, West Virginia, and
Oklahoma.</p>
<p>A better understanding of the data requires a much more in-depth
historical perspective than is within the scope of the current
analysis.</p>
</div>
</div>
<div id="animals---attributes-of-animals" class="section level2">
<h2>animals - Attributes of Animals</h2>
<div id="background-3" class="section level3">
<h3>Background</h3>
<blockquote>
<p>This data set considers 6 binary attributes for 20 animals.</p>
</blockquote>
<blockquote>
<p>see Struyf, Hubert &amp; Rousseeuw (1996), in agnes.</p>
</blockquote>
<p>Define variables:</p>
<div class="sourceCode" id="cb49"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb49-1"><a href="#cb49-1" tabindex="-1"></a>animals <span class="ot">&lt;-</span> cluster<span class="sc">::</span>animals</span>
<span id="cb49-2"><a href="#cb49-2" tabindex="-1"></a></span>
<span id="cb49-3"><a href="#cb49-3" tabindex="-1"></a><span class="fu">colnames</span>(animals) <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;warm-blooded&quot;</span>, </span>
<span id="cb49-4"><a href="#cb49-4" tabindex="-1"></a> <span class="st">&quot;can fly&quot;</span>,</span>
<span id="cb49-5"><a href="#cb49-5" tabindex="-1"></a> <span class="st">&quot;vertebrate&quot;</span>,</span>
<span id="cb49-6"><a href="#cb49-6" tabindex="-1"></a> <span class="st">&quot;endangered&quot;</span>,</span>
<span id="cb49-7"><a href="#cb49-7" tabindex="-1"></a> <span class="st">&quot;live in groups&quot;</span>,</span>
<span id="cb49-8"><a href="#cb49-8" tabindex="-1"></a> <span class="st">&quot;have hair&quot;</span>)</span></code></pre></div>
</div>
<div id="heatmap-1" class="section level3">
<h3>Heatmap</h3>
<p>This is a good example for using a heatmap + colored branches.</p>
<div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb50-1"><a href="#cb50-1" tabindex="-1"></a>dend_r <span class="ot">&lt;-</span> animals <span class="sc">%&gt;%</span> <span class="fu">dist</span>(<span class="at">method =</span> <span class="st">&quot;man&quot;</span>) <span class="sc">%&gt;%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">&quot;ward.D&quot;</span>) <span class="sc">%&gt;%</span> as.dendrogram <span class="sc">%&gt;%</span> ladderize <span class="sc">%&gt;%</span></span>
<span id="cb50-2"><a href="#cb50-2" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">4</span>)</span>
<span id="cb50-3"><a href="#cb50-3" tabindex="-1"></a></span>
<span id="cb50-4"><a href="#cb50-4" tabindex="-1"></a>dend_c <span class="ot">&lt;-</span> <span class="fu">t</span>(animals) <span class="sc">%&gt;%</span> <span class="fu">dist</span>(<span class="at">method =</span> <span class="st">&quot;man&quot;</span>) <span class="sc">%&gt;%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">&quot;com&quot;</span>) <span class="sc">%&gt;%</span> as.dendrogram <span class="sc">%&gt;%</span> ladderize<span class="sc">%&gt;%</span></span>
<span id="cb50-5"><a href="#cb50-5" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">3</span>)</span>
<span id="cb50-6"><a href="#cb50-6" tabindex="-1"></a></span>
<span id="cb50-7"><a href="#cb50-7" tabindex="-1"></a></span>
<span id="cb50-8"><a href="#cb50-8" tabindex="-1"></a><span class="co"># some_col_func &lt;- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))</span></span>
<span id="cb50-9"><a href="#cb50-9" tabindex="-1"></a><span class="co"># some_col_func &lt;- colorspace::diverge_hcl</span></span>
<span id="cb50-10"><a href="#cb50-10" tabindex="-1"></a><span class="co"># some_col_func &lt;- colorspace::sequential_hcl</span></span>
<span id="cb50-11"><a href="#cb50-11" tabindex="-1"></a>some_col_func <span class="ot">&lt;-</span> <span class="cf">function</span>(n) (colorspace<span class="sc">::</span><span class="fu">diverge_hcl</span>(n, <span class="at">h =</span> <span class="fu">c</span>(<span class="dv">246</span>, <span class="dv">40</span>), <span class="at">c =</span> <span class="dv">96</span>, <span class="at">l =</span> <span class="fu">c</span>(<span class="dv">65</span>, <span class="dv">90</span>)))</span>
<span id="cb50-12"><a href="#cb50-12" tabindex="-1"></a></span>
<span id="cb50-13"><a href="#cb50-13" tabindex="-1"></a></span>
<span id="cb50-14"><a href="#cb50-14" tabindex="-1"></a></span>
<span id="cb50-15"><a href="#cb50-15" tabindex="-1"></a><span class="co"># par(mar = c(3,3,3,3))</span></span>
<span id="cb50-16"><a href="#cb50-16" tabindex="-1"></a><span class="co"># library(gplots)</span></span>
<span id="cb50-17"><a href="#cb50-17" tabindex="-1"></a>gplots<span class="sc">::</span><span class="fu">heatmap.2</span>(<span class="fu">as.matrix</span>(animals<span class="dv">-1</span>), </span>
<span id="cb50-18"><a href="#cb50-18" tabindex="-1"></a> <span class="at">main =</span> <span class="st">&quot;Attributes of Animals&quot;</span>,</span>
<span id="cb50-19"><a href="#cb50-19" tabindex="-1"></a> <span class="at">srtCol =</span> <span class="dv">35</span>,</span>
<span id="cb50-20"><a href="#cb50-20" tabindex="-1"></a> <span class="at">Rowv =</span> dend_r,</span>
<span id="cb50-21"><a href="#cb50-21" tabindex="-1"></a> <span class="at">Colv =</span> dend_c,</span>
<span id="cb50-22"><a href="#cb50-22" tabindex="-1"></a> <span class="at">trace=</span><span class="st">&quot;row&quot;</span>, <span class="at">hline =</span> <span class="cn">NA</span>, <span class="at">tracecol =</span> <span class="st">&quot;darkgrey&quot;</span>, </span>
<span id="cb50-23"><a href="#cb50-23" tabindex="-1"></a> <span class="at">margins =</span><span class="fu">c</span>(<span class="dv">6</span>,<span class="dv">3</span>), </span>
<span id="cb50-24"><a href="#cb50-24" tabindex="-1"></a> <span class="at">key.xlab =</span> <span class="st">&quot;no / yes&quot;</span>,</span>
<span id="cb50-25"><a href="#cb50-25" tabindex="-1"></a> <span class="at">denscol =</span> <span class="st">&quot;grey&quot;</span>,</span>
<span id="cb50-26"><a href="#cb50-26" tabindex="-1"></a> <span class="at">density.info =</span> <span class="st">&quot;density&quot;</span>,</span>
<span id="cb50-27"><a href="#cb50-27" tabindex="-1"></a> <span class="at">col =</span> some_col_func</span>
<span id="cb50-28"><a href="#cb50-28" tabindex="-1"></a> )</span></code></pre></div>
<p><img role="img" src="
<p>We see that we have several groups of variables: the “can fly” and
“endangered” (which usually are both “no”), the “have hair”, and the
“warm-blooded”, “vertebrate”, and “live in groups”.</p>
<p>We see that within the animals there are (roughly!) the following 4
groups:</p>
<ol style="list-style-type: decimal">
<li>The cold-blooded non-vertebrates, which are mostly not
endangered.</li>
<li>The warm-blooded vertebrates, which live in groups, have hair,
cannot fly, and mostly are not endangered.</li>
<li>The cold-blooded vertebrates, without hair, cannot fly, and are not
endangered.</li>
<li>The (mostly) warm-blooded vertebrates, without hair, some can fly,
and some are endangered.</li>
</ol>
<p>How much of a difference would we get if we used another clustering
algorithm?</p>
<p>We first calculate the clustering using 8 different methods:</p>
<div class="sourceCode" id="cb51"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb51-1"><a href="#cb51-1" tabindex="-1"></a>hclust_methods <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;ward.D&quot;</span>, <span class="st">&quot;single&quot;</span>, <span class="st">&quot;complete&quot;</span>, <span class="st">&quot;average&quot;</span>, <span class="st">&quot;mcquitty&quot;</span>, </span>
<span id="cb51-2"><a href="#cb51-2" tabindex="-1"></a> <span class="st">&quot;median&quot;</span>, <span class="st">&quot;centroid&quot;</span>, <span class="st">&quot;ward.D2&quot;</span>)</span>
<span id="cb51-3"><a href="#cb51-3" tabindex="-1"></a>animals_dendlist <span class="ot">&lt;-</span> <span class="fu">dendlist</span>()</span>
<span id="cb51-4"><a href="#cb51-4" tabindex="-1"></a></span>
<span id="cb51-5"><a href="#cb51-5" tabindex="-1"></a><span class="cf">for</span>(i <span class="cf">in</span> <span class="fu">seq_along</span>(hclust_methods)) {</span>
<span id="cb51-6"><a href="#cb51-6" tabindex="-1"></a> tmp_dend <span class="ot">&lt;-</span> animals <span class="sc">%&gt;%</span> <span class="fu">dist</span>(<span class="at">method =</span> <span class="st">&quot;man&quot;</span>) <span class="sc">%&gt;%</span> </span>
<span id="cb51-7"><a href="#cb51-7" tabindex="-1"></a> <span class="fu">hclust</span>(<span class="at">method =</span> hclust_methods[i]) <span class="sc">%&gt;%</span> as.dendrogram </span>
<span id="cb51-8"><a href="#cb51-8" tabindex="-1"></a> animals_dendlist <span class="ot">&lt;-</span> <span class="fu">dendlist</span>(animals_dendlist, tmp_dend)</span>
<span id="cb51-9"><a href="#cb51-9" tabindex="-1"></a>}</span>
<span id="cb51-10"><a href="#cb51-10" tabindex="-1"></a><span class="fu">names</span>(animals_dendlist) <span class="ot">&lt;-</span> hclust_methods</span>
<span id="cb51-11"><a href="#cb51-11" tabindex="-1"></a><span class="co"># votes.repub_dendlist</span></span></code></pre></div>
<p>Next, we can look at the cophenetic correlation between each
clustering result using <code>cor.dendlist</code>. (This can be nicely
plotted using the <code>corrplot</code> function from the
<em>corrplot</em> package):</p>
<div class="sourceCode" id="cb52"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1" tabindex="-1"></a>cophenetic_cors <span class="ot">&lt;-</span> <span class="fu">cor.dendlist</span>(animals_dendlist)</span>
<span id="cb52-2"><a href="#cb52-2" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(cophenetic_cors, <span class="st">&quot;pie&quot;</span>, <span class="st">&quot;lower&quot;</span>)</span></code></pre></div>
<p><img role="img" src="
<p>We see that the different methods (other than ward.D and ward.D2),
all give quite different results. So would the above analysis be
different if we had used another clustering algorithm?</p>
<p>For this purpose, we compare the clustering solution of each
algorithm with one another, when cut to k=4 clusters, using the
Fowlkes-Mallows Index. This measure is similar to the Rand (or adjusted
Rand) index, and gives a value of 1 when the two clusterings conform,
and 0 when they do not:</p>
<div class="sourceCode" id="cb53"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb53-1"><a href="#cb53-1" tabindex="-1"></a>remove_median <span class="ot">&lt;-</span> <span class="fu">dendlist</span>(animals_dendlist, <span class="at">which =</span> <span class="fu">c</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">8</span>)[<span class="sc">-</span><span class="dv">6</span>] )</span>
<span id="cb53-2"><a href="#cb53-2" tabindex="-1"></a>FM_cors <span class="ot">&lt;-</span> <span class="fu">cor.dendlist</span>(remove_median, <span class="at">method =</span> <span class="st">&quot;FM_index&quot;</span>, <span class="at">k =</span> <span class="dv">4</span>)</span>
<span id="cb53-3"><a href="#cb53-3" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(FM_cors, <span class="st">&quot;pie&quot;</span>, <span class="st">&quot;lower&quot;</span>)</span></code></pre></div>
<p><img role="img" src="
<p>We removed the “median” method since a cut into k=4 clusters was not
possible for it. In general, the results seem sensitive to the algorithm
used, and the different algorithm methods do not seem to agree with one
another (with regard to k=4), so further analyses may be needed in order
to decide on which algorithm and interpretation are most appropriate for
these data.</p>
<p>(Other possible data sets for the future: chorSub, flower,
plantTraits, pluton, ruspini, agriculture)</p>
</div>
</div>
<!-- code folding -->
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
  // Build a <script> element that points at the MathJax CDN build and attach
  // it to <head>, so MathJax is fetched at page load instead of being
  // embedded in this document.
  var mathjaxLoader = document.createElement("script");
  var headElement = document.getElementsByTagName("head")[0];

  mathjaxLoader.type = "text/javascript";
  mathjaxLoader.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";

  headElement.appendChild(mathjaxLoader);
})();
</script>
</body>
</html>