1145 lines
2.5 MiB
HTML
1145 lines
2.5 MiB
HTML
|
<!DOCTYPE html>
|
|||
|
|
|||
|
<html lang="en">
|
|||
|
|
|||
|
<head>
|
|||
|
|
|||
|
<meta charset="utf-8" />
|
|||
|
<meta name="generator" content="pandoc" />
|
|||
|
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
|
|||
|
|
|||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|||
|
|
|||
|
<meta name="author" content="Tal Galili" />
|
|||
|
|
|||
|
<meta name="date" content="2024-11-15" />
|
|||
|
|
|||
|
<title>Hierarchical cluster analysis on famous data sets - enhanced with the dendextend package</title>
|
|||
|
|
|||
|
<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
|
|||
|
// be compatible with the behavior of Pandoc < 2.8).
|
|||
|
// Once the DOM has been parsed, strip every attribute from the heading that
// opens each pandoc section div.
document.addEventListener('DOMContentLoaded', function(e) {
|
|||
|
// First child of every div.section whose class attribute contains "level".
var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
|
|||
|
var i, h, a;
|
|||
|
for (i = 0; i < hs.length; i++) {
|
|||
|
h = hs[i];
|
|||
|
if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6
|
|||
|
a = h.attributes;
|
|||
|
// `a` is a live collection: each removal shrinks it, so always removing the
// entry at index 0 ends once the heading has no attributes left.
while (a.length > 0) h.removeAttribute(a[0].name);
|
|||
|
}
|
|||
|
});
|
|||
|
</script>
|
|||
|
|
|||
|
<style type="text/css">
|
|||
|
code{white-space: pre-wrap;}
|
|||
|
span.smallcaps{font-variant: small-caps;}
|
|||
|
span.underline{text-decoration: underline;}
|
|||
|
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
|||
|
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
|||
|
ul.task-list{list-style: none;}
|
|||
|
</style>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<style type="text/css">
|
|||
|
code {
|
|||
|
white-space: pre;
|
|||
|
}
|
|||
|
.sourceCode {
|
|||
|
overflow: visible;
|
|||
|
}
|
|||
|
</style>
|
|||
|
<style type="text/css" data-origin="pandoc">
|
|||
|
pre > code.sourceCode { white-space: pre; position: relative; }
|
|||
|
pre > code.sourceCode > span { line-height: 1.25; }
|
|||
|
pre > code.sourceCode > span:empty { height: 1.2em; }
|
|||
|
.sourceCode { overflow: visible; }
|
|||
|
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
|||
|
div.sourceCode { margin: 1em 0; }
|
|||
|
pre.sourceCode { margin: 0; }
|
|||
|
@media screen {
|
|||
|
div.sourceCode { overflow: auto; }
|
|||
|
}
|
|||
|
@media print {
|
|||
|
pre > code.sourceCode { white-space: pre-wrap; }
|
|||
|
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
|
|||
|
}
|
|||
|
pre.numberSource code
|
|||
|
{ counter-reset: source-line 0; }
|
|||
|
pre.numberSource code > span
|
|||
|
{ position: relative; left: -4em; counter-increment: source-line; }
|
|||
|
pre.numberSource code > span > a:first-child::before
|
|||
|
{ content: counter(source-line);
|
|||
|
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
|||
|
border: none; display: inline-block;
|
|||
|
-webkit-touch-callout: none; -webkit-user-select: none;
|
|||
|
-khtml-user-select: none; -moz-user-select: none;
|
|||
|
-ms-user-select: none; user-select: none;
|
|||
|
padding: 0 4px; width: 4em;
|
|||
|
color: #aaaaaa;
|
|||
|
}
|
|||
|
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
|
|||
|
div.sourceCode
|
|||
|
{ }
|
|||
|
@media screen {
|
|||
|
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
|||
|
}
|
|||
|
code span.al { color: #ff0000; font-weight: bold; }
|
|||
|
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.at { color: #7d9029; }
|
|||
|
code span.bn { color: #40a070; }
|
|||
|
code span.bu { color: #008000; }
|
|||
|
code span.cf { color: #007020; font-weight: bold; }
|
|||
|
code span.ch { color: #4070a0; }
|
|||
|
code span.cn { color: #880000; }
|
|||
|
code span.co { color: #60a0b0; font-style: italic; }
|
|||
|
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.do { color: #ba2121; font-style: italic; }
|
|||
|
code span.dt { color: #902000; }
|
|||
|
code span.dv { color: #40a070; }
|
|||
|
code span.er { color: #ff0000; font-weight: bold; }
|
|||
|
code span.ex { }
|
|||
|
code span.fl { color: #40a070; }
|
|||
|
code span.fu { color: #06287e; }
|
|||
|
code span.im { color: #008000; font-weight: bold; }
|
|||
|
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.kw { color: #007020; font-weight: bold; }
|
|||
|
code span.op { color: #666666; }
|
|||
|
code span.ot { color: #007020; }
|
|||
|
code span.pp { color: #bc7a00; }
|
|||
|
code span.sc { color: #4070a0; }
|
|||
|
code span.ss { color: #bb6688; }
|
|||
|
code span.st { color: #4070a0; }
|
|||
|
code span.va { color: #19177c; }
|
|||
|
code span.vs { color: #4070a0; }
|
|||
|
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
</style>
|
|||
|
<script>
|
|||
|
// apply pandoc div.sourceCode style to pre.sourceCode instead
|
|||
|
// Runs immediately: in every stylesheet tagged data-origin="pandoc", a
// div.sourceCode rule that sets a color or background-color is replaced by a
// pre.sourceCode rule carrying the same declarations.
(function() {
|
|||
|
var sheets = document.styleSheets;
|
|||
|
for (var i = 0; i < sheets.length; i++) {
|
|||
|
// Only stylesheets whose owner element has data-origin="pandoc" are touched.
if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
|
|||
|
// Skip the sheet if reading its rule list throws.
try { var rules = sheets[i].cssRules; } catch (e) { continue; }
|
|||
|
var j = 0;
|
|||
|
while (j < rules.length) {
|
|||
|
var rule = rules[j];
|
|||
|
// check if there is a div.sourceCode rule
|
|||
|
if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
|
|||
|
j++;
|
|||
|
continue;
|
|||
|
}
|
|||
|
// Capture the declarations before the rule is deleted below.
var style = rule.style.cssText;
|
|||
|
// check if color or background-color is set
|
|||
|
if (rule.style.color === '' && rule.style.backgroundColor === '') {
|
|||
|
j++;
|
|||
|
continue;
|
|||
|
}
|
|||
|
// replace div.sourceCode by a pre.sourceCode rule
|
|||
|
// j is intentionally not advanced here: the rule inserted at index j has the
// selector pre.sourceCode, so the next pass takes the non-matching branch
// above and steps past it.
sheets[i].deleteRule(j);
|
|||
|
sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
|
|||
|
}
|
|||
|
}
|
|||
|
})();
|
|||
|
</script>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<style type="text/css">body {
|
|||
|
background-color: #fff;
|
|||
|
margin: 1em auto;
|
|||
|
max-width: 700px;
|
|||
|
overflow: visible;
|
|||
|
padding-left: 2em;
|
|||
|
padding-right: 2em;
|
|||
|
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
|
|||
|
font-size: 14px;
|
|||
|
line-height: 1.35;
|
|||
|
}
|
|||
|
#TOC {
|
|||
|
clear: both;
|
|||
|
margin: 0 0 10px 10px;
|
|||
|
padding: 4px;
|
|||
|
width: 400px;
|
|||
|
border: 1px solid #CCCCCC;
|
|||
|
border-radius: 5px;
|
|||
|
background-color: #f6f6f6;
|
|||
|
font-size: 13px;
|
|||
|
line-height: 1.3;
|
|||
|
}
|
|||
|
#TOC .toctitle {
|
|||
|
font-weight: bold;
|
|||
|
font-size: 15px;
|
|||
|
margin-left: 5px;
|
|||
|
}
|
|||
|
#TOC ul {
|
|||
|
padding-left: 40px;
|
|||
|
margin-left: -1.5em;
|
|||
|
margin-top: 5px;
|
|||
|
margin-bottom: 5px;
|
|||
|
}
|
|||
|
#TOC ul ul {
|
|||
|
margin-left: -2em;
|
|||
|
}
|
|||
|
#TOC li {
|
|||
|
line-height: 16px;
|
|||
|
}
|
|||
|
table {
|
|||
|
margin: 1em auto;
|
|||
|
border-width: 1px;
|
|||
|
border-color: #DDDDDD;
|
|||
|
border-style: outset;
|
|||
|
border-collapse: collapse;
|
|||
|
}
|
|||
|
table th {
|
|||
|
border-width: 2px;
|
|||
|
padding: 5px;
|
|||
|
border-style: inset;
|
|||
|
}
|
|||
|
table td {
|
|||
|
border-width: 1px;
|
|||
|
border-style: inset;
|
|||
|
line-height: 18px;
|
|||
|
padding: 5px 5px;
|
|||
|
}
|
|||
|
table, table th, table td {
|
|||
|
border-left-style: none;
|
|||
|
border-right-style: none;
|
|||
|
}
|
|||
|
table thead, table tr.even {
|
|||
|
background-color: #f7f7f7;
|
|||
|
}
|
|||
|
p {
|
|||
|
margin: 0.5em 0;
|
|||
|
}
|
|||
|
blockquote {
|
|||
|
background-color: #f6f6f6;
|
|||
|
padding: 0.25em 0.75em;
|
|||
|
}
|
|||
|
hr {
|
|||
|
border-style: solid;
|
|||
|
border: none;
|
|||
|
border-top: 1px solid #777;
|
|||
|
margin: 28px 0;
|
|||
|
}
|
|||
|
dl {
|
|||
|
margin-left: 0;
|
|||
|
}
|
|||
|
dl dd {
|
|||
|
margin-bottom: 13px;
|
|||
|
margin-left: 13px;
|
|||
|
}
|
|||
|
dl dt {
|
|||
|
font-weight: bold;
|
|||
|
}
|
|||
|
ul {
|
|||
|
margin-top: 0;
|
|||
|
}
|
|||
|
ul li {
|
|||
|
list-style: circle outside;
|
|||
|
}
|
|||
|
ul ul {
|
|||
|
margin-bottom: 0;
|
|||
|
}
|
|||
|
pre, code {
|
|||
|
background-color: #f7f7f7;
|
|||
|
border-radius: 3px;
|
|||
|
color: #333;
|
|||
|
white-space: pre-wrap;
|
|||
|
}
|
|||
|
pre {
|
|||
|
border-radius: 3px;
|
|||
|
margin: 5px 0px 10px 0px;
|
|||
|
padding: 10px;
|
|||
|
}
|
|||
|
pre:not([class]) {
|
|||
|
background-color: #f7f7f7;
|
|||
|
}
|
|||
|
code {
|
|||
|
font-family: Consolas, Monaco, 'Courier New', monospace;
|
|||
|
font-size: 85%;
|
|||
|
}
|
|||
|
p > code, li > code {
|
|||
|
padding: 2px 0px;
|
|||
|
}
|
|||
|
div.figure {
|
|||
|
text-align: center;
|
|||
|
}
|
|||
|
img {
|
|||
|
background-color: #FFFFFF;
|
|||
|
padding: 2px;
|
|||
|
border: 1px solid #DDDDDD;
|
|||
|
border-radius: 3px;
|
|||
|
border: 1px solid #CCCCCC;
|
|||
|
margin: 0 5px;
|
|||
|
}
|
|||
|
h1 {
|
|||
|
margin-top: 0;
|
|||
|
font-size: 35px;
|
|||
|
line-height: 40px;
|
|||
|
}
|
|||
|
h2 {
|
|||
|
border-bottom: 4px solid #f7f7f7;
|
|||
|
padding-top: 10px;
|
|||
|
padding-bottom: 2px;
|
|||
|
font-size: 145%;
|
|||
|
}
|
|||
|
h3 {
|
|||
|
border-bottom: 2px solid #f7f7f7;
|
|||
|
padding-top: 10px;
|
|||
|
font-size: 120%;
|
|||
|
}
|
|||
|
h4 {
|
|||
|
border-bottom: 1px solid #f7f7f7;
|
|||
|
margin-left: 8px;
|
|||
|
font-size: 105%;
|
|||
|
}
|
|||
|
h5, h6 {
|
|||
|
border-bottom: 1px solid #ccc;
|
|||
|
font-size: 105%;
|
|||
|
}
|
|||
|
a {
|
|||
|
color: #0033dd;
|
|||
|
text-decoration: none;
|
|||
|
}
|
|||
|
a:hover {
|
|||
|
color: #6666ff; }
|
|||
|
a:visited {
|
|||
|
color: #800080; }
|
|||
|
a:visited:hover {
|
|||
|
color: #BB00BB; }
|
|||
|
a[href^="http:"] {
|
|||
|
text-decoration: underline; }
|
|||
|
a[href^="https:"] {
|
|||
|
text-decoration: underline; }
|
|||
|
|
|||
|
code > span.kw { color: #555; font-weight: bold; }
|
|||
|
code > span.dt { color: #902000; }
|
|||
|
code > span.dv { color: #40a070; }
|
|||
|
code > span.bn { color: #d14; }
|
|||
|
code > span.fl { color: #d14; }
|
|||
|
code > span.ch { color: #d14; }
|
|||
|
code > span.st { color: #d14; }
|
|||
|
code > span.co { color: #888888; font-style: italic; }
|
|||
|
code > span.ot { color: #007020; }
|
|||
|
code > span.al { color: #ff0000; font-weight: bold; }
|
|||
|
code > span.fu { color: #900; font-weight: bold; }
|
|||
|
code > span.er { color: #a61717; background-color: #e3d2d2; }
|
|||
|
</style>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
</head>
|
|||
|
|
|||
|
<body>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<h1 class="title toc-ignore">Hierarchical cluster analysis on famous
|
|||
|
data sets - enhanced with the <em>dendextend</em> package</h1>
|
|||
|
<h4 class="author">Tal Galili</h4>
|
|||
|
<h4 class="date">2024-11-15</h4>
|
|||
|
|
|||
|
|
|||
|
<div id="TOC">
|
|||
|
<ul>
|
|||
|
<li><a href="#introduction" id="toc-introduction">Introduction</a></li>
|
|||
|
<li><a href="#iris---edgar-andersons-iris-data" id="toc-iris---edgar-andersons-iris-data">iris - Edgar Anderson’s Iris
|
|||
|
Data</a>
|
|||
|
<ul>
|
|||
|
<li><a href="#background" id="toc-background">Background</a></li>
|
|||
|
<li><a href="#the-3-clusters-from-the-complete-method-vs-the-real-species-category" id="toc-the-3-clusters-from-the-complete-method-vs-the-real-species-category">The
|
|||
|
3 clusters from the “complete” method vs the real species
|
|||
|
category</a></li>
|
|||
|
<li><a href="#similaritydifference-between-various-clustering-algorithms" id="toc-similaritydifference-between-various-clustering-algorithms">Similarity/difference
|
|||
|
between various clustering algorithms</a></li>
|
|||
|
<li><a href="#clustering-prediction-of-the-3-species-classes" id="toc-clustering-prediction-of-the-3-species-classes">Clustering
|
|||
|
prediction of the 3 species classes</a></li>
|
|||
|
<li><a href="#conclusion" id="toc-conclusion">Conclusion</a></li>
|
|||
|
</ul></li>
|
|||
|
<li><a href="#khan---microarray-gene-expression-data-set-from-khan-et-al.-2001.-subset-of-306-genes." id="toc-khan---microarray-gene-expression-data-set-from-khan-et-al.-2001.-subset-of-306-genes.">khan
|
|||
|
- Microarray gene expression data set from Khan et al., 2001. Subset of
|
|||
|
306 genes.</a>
|
|||
|
<ul>
|
|||
|
<li><a href="#background-1" id="toc-background-1">Background</a></li>
|
|||
|
<li><a href="#comparing-the-train-vs-test-dendrograms" id="toc-comparing-the-train-vs-test-dendrograms">Comparing the train vs
|
|||
|
test dendrograms</a></li>
|
|||
|
<li><a href="#conclusion-1" id="toc-conclusion-1">Conclusion</a></li>
|
|||
|
</ul></li>
|
|||
|
<li><a href="#votes.repub---votes-for-republican-candidate-in-presidential-elections" id="toc-votes.repub---votes-for-republican-candidate-in-presidential-elections">votes.repub
|
|||
|
- Votes for Republican Candidate in Presidential Elections</a>
|
|||
|
<ul>
|
|||
|
<li><a href="#background-2" id="toc-background-2">Background</a></li>
|
|||
|
<li><a href="#heatmap" id="toc-heatmap">Heatmap</a></li>
|
|||
|
</ul></li>
|
|||
|
<li><a href="#animals---attributes-of-animals" id="toc-animals---attributes-of-animals">animals - Attributes of
|
|||
|
Animals</a>
|
|||
|
<ul>
|
|||
|
<li><a href="#background-3" id="toc-background-3">Background</a></li>
|
|||
|
<li><a href="#heatmap-1" id="toc-heatmap-1">Heatmap</a></li>
|
|||
|
</ul></li>
|
|||
|
</ul>
|
|||
|
</div>
|
|||
|
|
|||
|
<!--
|
|||
|
%\VignetteEngine{knitr::rmarkdown}
|
|||
|
%\VignetteIndexEntry{Hierarchical cluster analysis on famous data sets - enhanced with the _dendextend_ package}
|
|||
|
-->
|
|||
|
<div id="introduction" class="section level2">
|
|||
|
<h2>Introduction</h2>
|
|||
|
<p>This document demonstrates, on several famous data sets, how the
|
|||
|
<em>dendextend</em> R package can be used to enhance Hierarchical
|
|||
|
Cluster Analysis (through better visualization and sensitivity
|
|||
|
analysis).</p>
|
|||
|
</div>
|
|||
|
<div id="iris---edgar-andersons-iris-data" class="section level2">
|
|||
|
<h2>iris - Edgar Anderson’s Iris Data</h2>
|
|||
|
<div id="background" class="section level3">
|
|||
|
<h3>Background</h3>
|
|||
|
<blockquote>
|
|||
|
<p>The famous (Fisher’s or Anderson’s) iris data set gives the
|
|||
|
measurements in centimeters of the variables sepal length and width and
|
|||
|
petal length and width, respectively, for 50 flowers from each of 3
|
|||
|
species of iris. The species are Iris setosa, versicolor, and virginica.
|
|||
|
(from <code>?iris</code>)</p>
|
|||
|
</blockquote>
|
|||
|
<p>The <a href="https://en.wikipedia.org/wiki/Iris_flower_data_set">Iris
|
|||
|
flower data set</a> is fun for learning supervised classification
|
|||
|
algorithms, and is known as a difficult case for unsupervised learning.
|
|||
|
This is easily seen through the following Scatter Plot Matrix
|
|||
|
(SPLOM):</p>
|
|||
|
<p>Define variables:</p>
|
|||
|
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a>iris <span class="ot"><-</span> datasets<span class="sc">::</span>iris</span>
|
|||
|
<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a>iris2 <span class="ot"><-</span> iris[,<span class="sc">-</span><span class="dv">5</span>]</span>
|
|||
|
<span id="cb1-3"><a href="#cb1-3" tabindex="-1"></a>species_labels <span class="ot"><-</span> iris[,<span class="dv">5</span>]</span>
|
|||
|
<span id="cb1-4"><a href="#cb1-4" tabindex="-1"></a><span class="fu">library</span>(colorspace) <span class="co"># get nice colors</span></span>
|
|||
|
<span id="cb1-5"><a href="#cb1-5" tabindex="-1"></a>species_col <span class="ot"><-</span> <span class="fu">rev</span>(<span class="fu">rainbow_hcl</span>(<span class="dv">3</span>))[<span class="fu">as.numeric</span>(species_labels)]</span></code></pre></div>
|
|||
|
<p>SPLOM:</p>
|
|||
|
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a><span class="co"># Plot a SPLOM:</span></span>
|
|||
|
<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a><span class="fu">pairs</span>(iris2, <span class="at">col =</span> species_col,</span>
|
|||
|
<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a> <span class="at">lower.panel =</span> <span class="cn">NULL</span>,</span>
|
|||
|
<span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a> <span class="at">cex.labels=</span><span class="dv">2</span>, <span class="at">pch=</span><span class="dv">19</span>, <span class="at">cex =</span> <span class="fl">1.2</span>)</span>
|
|||
|
<span id="cb2-5"><a href="#cb2-5" tabindex="-1"></a></span>
|
|||
|
<span id="cb2-6"><a href="#cb2-6" tabindex="-1"></a><span class="co"># Add a legend</span></span>
|
|||
|
<span id="cb2-7"><a href="#cb2-7" tabindex="-1"></a><span class="fu">par</span>(<span class="at">xpd =</span> <span class="cn">TRUE</span>)</span>
|
|||
|
<span id="cb2-8"><a href="#cb2-8" tabindex="-1"></a><span class="fu">legend</span>(<span class="at">x =</span> <span class="fl">0.05</span>, <span class="at">y =</span> <span class="fl">0.4</span>, <span class="at">cex =</span> <span class="dv">2</span>,</span>
|
|||
|
<span id="cb2-9"><a href="#cb2-9" tabindex="-1"></a> <span class="at">legend =</span> <span class="fu">as.character</span>(<span class="fu">levels</span>(species_labels)),</span>
|
|||
|
<span id="cb2-10"><a href="#cb2-10" tabindex="-1"></a> <span class="at">fill =</span> <span class="fu">unique</span>(species_col))</span>
|
|||
|
<span id="cb2-11"><a href="#cb2-11" tabindex="-1"></a><span class="fu">par</span>(<span class="at">xpd =</span> <span class="cn">NA</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We can see that the <em>Setosa</em> species are distinctly different
|
|||
|
from <em>Versicolor</em> and <em>Virginica</em> (they have lower petal
|
|||
|
length and width). But <em>Versicolor</em> and <em>Virginica</em> cannot
|
|||
|
easily be separated based on measurements of their sepal and petal
|
|||
|
width/length.</p>
|
|||
|
<p>The same conclusion can be made by looking at the parallel
|
|||
|
coordinates plot of the data:</p>
|
|||
|
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a><span class="co"># http://blog.safaribooksonline.com/2014/03/31/mastering-parallel-coordinate-charts-r/</span></span>
|
|||
|
<span id="cb3-2"><a href="#cb3-2" tabindex="-1"></a><span class="fu">par</span>(<span class="at">las =</span> <span class="dv">1</span>, <span class="at">mar =</span> <span class="fu">c</span>(<span class="fl">4.5</span>, <span class="dv">3</span>, <span class="dv">3</span>, <span class="dv">2</span>) <span class="sc">+</span> <span class="fl">0.1</span>, <span class="at">cex =</span> .<span class="dv">8</span>)</span>
|
|||
|
<span id="cb3-3"><a href="#cb3-3" tabindex="-1"></a>MASS<span class="sc">::</span><span class="fu">parcoord</span>(iris2, <span class="at">col =</span> species_col, <span class="at">var.label =</span> <span class="cn">TRUE</span>, <span class="at">lwd =</span> <span class="dv">2</span>)</span>
|
|||
|
<span id="cb3-4"><a href="#cb3-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb3-5"><a href="#cb3-5" tabindex="-1"></a><span class="co"># Add Title</span></span>
|
|||
|
<span id="cb3-6"><a href="#cb3-6" tabindex="-1"></a><span class="fu">title</span>(<span class="st">"Parallel coordinates plot of the Iris data"</span>)</span>
|
|||
|
<span id="cb3-7"><a href="#cb3-7" tabindex="-1"></a><span class="co"># Add a legend</span></span>
|
|||
|
<span id="cb3-8"><a href="#cb3-8" tabindex="-1"></a><span class="fu">par</span>(<span class="at">xpd =</span> <span class="cn">TRUE</span>)</span>
|
|||
|
<span id="cb3-9"><a href="#cb3-9" tabindex="-1"></a><span class="fu">legend</span>(<span class="at">x =</span> <span class="fl">1.75</span>, <span class="at">y =</span> <span class="sc">-</span>.<span class="dv">25</span>, <span class="at">cex =</span> <span class="dv">1</span>,</span>
|
|||
|
<span id="cb3-10"><a href="#cb3-10" tabindex="-1"></a> <span class="at">legend =</span> <span class="fu">as.character</span>(<span class="fu">levels</span>(species_labels)),</span>
|
|||
|
<span id="cb3-11"><a href="#cb3-11" tabindex="-1"></a> <span class="at">fill =</span> <span class="fu">unique</span>(species_col), <span class="at">horiz =</span> <span class="cn">TRUE</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a><span class="fu">par</span>(<span class="at">xpd =</span> <span class="cn">NA</span>)</span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="the-3-clusters-from-the-complete-method-vs-the-real-species-category" class="section level3">
|
|||
|
<h3>The 3 clusters from the “complete” method vs the real species
|
|||
|
category</h3>
|
|||
|
<p>The default hierarchical clustering method in <code>hclust</code> is
|
|||
|
“complete”. We can visualize the result of running it by turning the
|
|||
|
object into a dendrogram and making several adjustments to the object,
|
|||
|
such as: changing the labels, coloring the labels based on the real
|
|||
|
species category, and coloring the branches based on cutting the tree
|
|||
|
into three clusters.</p>
|
|||
|
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a>d_iris <span class="ot"><-</span> <span class="fu">dist</span>(iris2) <span class="co"># method="man" # is a bit better</span></span>
|
|||
|
<span id="cb5-2"><a href="#cb5-2" tabindex="-1"></a>hc_iris <span class="ot"><-</span> <span class="fu">hclust</span>(d_iris, <span class="at">method =</span> <span class="st">"complete"</span>)</span>
|
|||
|
<span id="cb5-3"><a href="#cb5-3" tabindex="-1"></a>iris_species <span class="ot"><-</span> <span class="fu">rev</span>(<span class="fu">levels</span>(iris[,<span class="dv">5</span>]))</span>
|
|||
|
<span id="cb5-4"><a href="#cb5-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb5-5"><a href="#cb5-5" tabindex="-1"></a><span class="fu">library</span>(dendextend)</span>
|
|||
|
<span id="cb5-6"><a href="#cb5-6" tabindex="-1"></a>dend <span class="ot"><-</span> <span class="fu">as.dendrogram</span>(hc_iris)</span>
|
|||
|
<span id="cb5-7"><a href="#cb5-7" tabindex="-1"></a><span class="co"># order it the closest we can to the order of the observations:</span></span>
|
|||
|
<span id="cb5-8"><a href="#cb5-8" tabindex="-1"></a>dend <span class="ot"><-</span> <span class="fu">rotate</span>(dend, <span class="dv">1</span><span class="sc">:</span><span class="dv">150</span>)</span>
|
|||
|
<span id="cb5-9"><a href="#cb5-9" tabindex="-1"></a></span>
|
|||
|
<span id="cb5-10"><a href="#cb5-10" tabindex="-1"></a><span class="co"># Color the branches based on the clusters:</span></span>
|
|||
|
<span id="cb5-11"><a href="#cb5-11" tabindex="-1"></a>dend <span class="ot"><-</span> <span class="fu">color_branches</span>(dend, <span class="at">k=</span><span class="dv">3</span>) <span class="co">#, groupLabels=iris_species)</span></span>
|
|||
|
<span id="cb5-12"><a href="#cb5-12" tabindex="-1"></a></span>
|
|||
|
<span id="cb5-13"><a href="#cb5-13" tabindex="-1"></a><span class="co"># Manually match the labels, as much as possible, to the real classification of the flowers:</span></span>
|
|||
|
<span id="cb5-14"><a href="#cb5-14" tabindex="-1"></a><span class="fu">labels_colors</span>(dend) <span class="ot"><-</span></span>
|
|||
|
<span id="cb5-15"><a href="#cb5-15" tabindex="-1"></a> <span class="fu">rainbow_hcl</span>(<span class="dv">3</span>)[<span class="fu">sort_levels_values</span>(</span>
|
|||
|
<span id="cb5-16"><a href="#cb5-16" tabindex="-1"></a> <span class="fu">as.numeric</span>(iris[,<span class="dv">5</span>])[<span class="fu">order.dendrogram</span>(dend)]</span>
|
|||
|
<span id="cb5-17"><a href="#cb5-17" tabindex="-1"></a> )]</span>
|
|||
|
<span id="cb5-18"><a href="#cb5-18" tabindex="-1"></a></span>
|
|||
|
<span id="cb5-19"><a href="#cb5-19" tabindex="-1"></a><span class="co"># We shall add the flower type to the labels:</span></span>
|
|||
|
<span id="cb5-20"><a href="#cb5-20" tabindex="-1"></a><span class="fu">labels</span>(dend) <span class="ot"><-</span> <span class="fu">paste</span>(<span class="fu">as.character</span>(iris[,<span class="dv">5</span>])[<span class="fu">order.dendrogram</span>(dend)],</span>
|
|||
|
<span id="cb5-21"><a href="#cb5-21" tabindex="-1"></a> <span class="st">"("</span>,<span class="fu">labels</span>(dend),<span class="st">")"</span>, </span>
|
|||
|
<span id="cb5-22"><a href="#cb5-22" tabindex="-1"></a> <span class="at">sep =</span> <span class="st">""</span>)</span>
|
|||
|
<span id="cb5-23"><a href="#cb5-23" tabindex="-1"></a><span class="co"># We hang the dendrogram a bit:</span></span>
|
|||
|
<span id="cb5-24"><a href="#cb5-24" tabindex="-1"></a>dend <span class="ot"><-</span> <span class="fu">hang.dendrogram</span>(dend,<span class="at">hang_height=</span><span class="fl">0.1</span>)</span>
|
|||
|
<span id="cb5-25"><a href="#cb5-25" tabindex="-1"></a><span class="co"># reduce the size of the labels:</span></span>
|
|||
|
<span id="cb5-26"><a href="#cb5-26" tabindex="-1"></a><span class="co"># dend <- assign_values_to_leaves_nodePar(dend, 0.5, "lab.cex")</span></span>
|
|||
|
<span id="cb5-27"><a href="#cb5-27" tabindex="-1"></a>dend <span class="ot"><-</span> <span class="fu">set</span>(dend, <span class="st">"labels_cex"</span>, <span class="fl">0.5</span>)</span>
|
|||
|
<span id="cb5-28"><a href="#cb5-28" tabindex="-1"></a><span class="co"># And plot:</span></span>
|
|||
|
<span id="cb5-29"><a href="#cb5-29" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mar =</span> <span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">3</span>,<span class="dv">3</span>,<span class="dv">7</span>))</span>
|
|||
|
<span id="cb5-30"><a href="#cb5-30" tabindex="-1"></a><span class="fu">plot</span>(dend, </span>
|
|||
|
<span id="cb5-31"><a href="#cb5-31" tabindex="-1"></a> <span class="at">main =</span> <span class="st">"Clustered Iris data set</span></span>
|
|||
|
<span id="cb5-32"><a href="#cb5-32" tabindex="-1"></a><span class="st"> (the labels give the true flower species)"</span>, </span>
|
|||
|
<span id="cb5-33"><a href="#cb5-33" tabindex="-1"></a> <span class="at">horiz =</span> <span class="cn">TRUE</span>, <span class="at">nodePar =</span> <span class="fu">list</span>(<span class="at">cex =</span> .<span class="dv">007</span>))</span>
|
|||
|
<span id="cb5-34"><a href="#cb5-34" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">"topleft"</span>, <span class="at">legend =</span> iris_species, <span class="at">fill =</span> <span class="fu">rainbow_hcl</span>(<span class="dv">3</span>))</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a><span class="do">#### BTW, notice that:</span></span>
|
|||
|
<span id="cb6-2"><a href="#cb6-2" tabindex="-1"></a><span class="co"># labels(hc_iris) # no labels, because "iris" has no row names</span></span>
|
|||
|
<span id="cb6-3"><a href="#cb6-3" tabindex="-1"></a><span class="co"># is.integer(labels(dend)) # this could cause problems...</span></span>
|
|||
|
<span id="cb6-4"><a href="#cb6-4" tabindex="-1"></a><span class="co"># is.character(labels(dend)) # labels are no longer "integer"</span></span></code></pre></div>
|
|||
|
<p>The same can be presented in a circular layout:</p>
|
|||
|
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a><span class="co"># Requires that the circlize package will be installed</span></span>
|
|||
|
<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mar =</span> <span class="fu">rep</span>(<span class="dv">0</span>,<span class="dv">4</span>))</span>
|
|||
|
<span id="cb7-3"><a href="#cb7-3" tabindex="-1"></a><span class="fu">circlize_dendrogram</span>(dend)</span></code></pre></div>
|
|||
|
<pre><code>## Loading required namespace: circlize</code></pre>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>These visualizations easily demonstrate how the separation of the
|
|||
|
hierarchical clustering is very good with the “Setosa” species, but
|
|||
|
errs by labeling many “Versicolor” flowers as “Virginica”.</p>
|
|||
|
<p>The hanging of the tree also helps to locate extreme observations.
|
|||
|
For example, we can see that observation “virginica (107)” is not very
|
|||
|
similar to the Versicolor species, but still, it is among them. Also,
|
|||
|
“versicolor (71)” is located too deep “within” the group of Virginica
|
|||
|
flowers.</p>
|
|||
|
<p>We can also explore the data using a heatmap. The rows are ordered
|
|||
|
based on the order of the hierarchical clustering (using the “complete”
|
|||
|
method). The colored bar indicates the species category each row belongs
|
|||
|
to. The color in the heatmap indicates the length of each measurement
|
|||
|
(from light yellow to dark red).</p>
|
|||
|
<p>In the heatmap we also see how the Setosa species has low petal
|
|||
|
values (in light yellow), but it is very difficult to see any clear
|
|||
|
distinction between the other two species.</p>
|
|||
|
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a>some_col_func <span class="ot"><-</span> <span class="cf">function</span>(n) <span class="fu">rev</span>(colorspace<span class="sc">::</span><span class="fu">heat_hcl</span>(n, <span class="at">c =</span> <span class="fu">c</span>(<span class="dv">80</span>, <span class="dv">30</span>), <span class="at">l =</span> <span class="fu">c</span>(<span class="dv">30</span>, <span class="dv">90</span>), <span class="at">power =</span> <span class="fu">c</span>(<span class="dv">1</span><span class="sc">/</span><span class="dv">5</span>, <span class="fl">1.5</span>)))</span>
|
|||
|
<span id="cb9-2"><a href="#cb9-2" tabindex="-1"></a></span>
|
|||
|
<span id="cb9-3"><a href="#cb9-3" tabindex="-1"></a><span class="co"># scaled_iris2 <- iris2 %>% as.matrix %>% scale</span></span>
|
|||
|
<span id="cb9-4"><a href="#cb9-4" tabindex="-1"></a><span class="co"># library(gplots)</span></span>
|
|||
|
<span id="cb9-5"><a href="#cb9-5" tabindex="-1"></a>gplots<span class="sc">::</span><span class="fu">heatmap.2</span>(<span class="fu">as.matrix</span>(iris2), </span>
|
|||
|
<span id="cb9-6"><a href="#cb9-6" tabindex="-1"></a> <span class="at">main =</span> <span class="st">"Heatmap for the Iris data set"</span>,</span>
|
|||
|
<span id="cb9-7"><a href="#cb9-7" tabindex="-1"></a> <span class="at">srtCol =</span> <span class="dv">20</span>,</span>
|
|||
|
<span id="cb9-8"><a href="#cb9-8" tabindex="-1"></a> <span class="at">dendrogram =</span> <span class="st">"row"</span>,</span>
|
|||
|
<span id="cb9-9"><a href="#cb9-9" tabindex="-1"></a> <span class="at">Rowv =</span> dend,</span>
|
|||
|
<span id="cb9-10"><a href="#cb9-10" tabindex="-1"></a> <span class="at">Colv =</span> <span class="st">"NA"</span>, <span class="co"># this to make sure the columns are not ordered</span></span>
|
|||
|
<span id="cb9-11"><a href="#cb9-11" tabindex="-1"></a> <span class="at">trace=</span><span class="st">"none"</span>, </span>
|
|||
|
<span id="cb9-12"><a href="#cb9-12" tabindex="-1"></a> <span class="at">margins =</span><span class="fu">c</span>(<span class="dv">5</span>,<span class="fl">0.1</span>), </span>
|
|||
|
<span id="cb9-13"><a href="#cb9-13" tabindex="-1"></a> <span class="at">key.xlab =</span> <span class="st">"Cm"</span>,</span>
|
|||
|
<span id="cb9-14"><a href="#cb9-14" tabindex="-1"></a> <span class="at">denscol =</span> <span class="st">"grey"</span>,</span>
|
|||
|
<span id="cb9-15"><a href="#cb9-15" tabindex="-1"></a> <span class="at">density.info =</span> <span class="st">"density"</span>,</span>
|
|||
|
<span id="cb9-16"><a href="#cb9-16" tabindex="-1"></a> <span class="at">RowSideColors =</span> <span class="fu">rev</span>(<span class="fu">labels_colors</span>(dend)), <span class="co"># to add nice colored strips </span></span>
|
|||
|
<span id="cb9-17"><a href="#cb9-17" tabindex="-1"></a> <span class="at">col =</span> some_col_func</span>
|
|||
|
<span id="cb9-18"><a href="#cb9-18" tabindex="-1"></a> )</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We can get an interactive heatmap by using the <code>heatmaply</code>
|
|||
|
package/function: (code is not evaluated in order to keep the HTML
|
|||
|
size small)</p>
|
|||
|
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a>heatmaply<span class="sc">::</span><span class="fu">heatmaply</span>(<span class="fu">as.matrix</span>(iris2),</span>
|
|||
|
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a> <span class="at">dendrogram =</span> <span class="st">"row"</span>,</span>
|
|||
|
<span id="cb10-3"><a href="#cb10-3" tabindex="-1"></a> <span class="at">Rowv =</span> dend)</span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="similaritydifference-between-various-clustering-algorithms" class="section level3">
|
|||
|
<h3>Similarity/difference between various clustering algorithms</h3>
|
|||
|
<p>We may ask ourselves how many different results we could get if we
|
|||
|
would use different clustering algorithms (<code>hclust</code> has 8
|
|||
|
different algorithms implemented). For the purpose of this analysis, we
|
|||
|
will create all 8 hclust objects, and chain them together into a single
|
|||
|
<code>dendlist</code> object (which, as the name implies, can hold a
|
|||
|
bunch of dendrograms together for the purpose of further analysis).</p>
|
|||
|
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a>hclust_methods <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"ward.D"</span>, <span class="st">"single"</span>, <span class="st">"complete"</span>, <span class="st">"average"</span>, <span class="st">"mcquitty"</span>, </span>
|
|||
|
<span id="cb11-2"><a href="#cb11-2" tabindex="-1"></a> <span class="st">"median"</span>, <span class="st">"centroid"</span>, <span class="st">"ward.D2"</span>)</span>
|
|||
|
<span id="cb11-3"><a href="#cb11-3" tabindex="-1"></a>iris_dendlist <span class="ot"><-</span> <span class="fu">dendlist</span>()</span>
|
|||
|
<span id="cb11-4"><a href="#cb11-4" tabindex="-1"></a><span class="cf">for</span>(i <span class="cf">in</span> <span class="fu">seq_along</span>(hclust_methods)) {</span>
|
|||
|
<span id="cb11-5"><a href="#cb11-5" tabindex="-1"></a> hc_iris <span class="ot"><-</span> <span class="fu">hclust</span>(d_iris, <span class="at">method =</span> hclust_methods[i]) </span>
|
|||
|
<span id="cb11-6"><a href="#cb11-6" tabindex="-1"></a> iris_dendlist <span class="ot"><-</span> <span class="fu">dendlist</span>(iris_dendlist, <span class="fu">as.dendrogram</span>(hc_iris))</span>
|
|||
|
<span id="cb11-7"><a href="#cb11-7" tabindex="-1"></a>}</span>
|
|||
|
<span id="cb11-8"><a href="#cb11-8" tabindex="-1"></a><span class="fu">names</span>(iris_dendlist) <span class="ot"><-</span> hclust_methods</span>
|
|||
|
<span id="cb11-9"><a href="#cb11-9" tabindex="-1"></a>iris_dendlist</span></code></pre></div>
|
|||
|
<pre><code>## $ward.D
|
|||
|
## 'dendrogram' with 2 branches and 150 members total, at height 199.6205
|
|||
|
##
|
|||
|
## $single
|
|||
|
## 'dendrogram' with 2 branches and 150 members total, at height 1.640122
|
|||
|
##
|
|||
|
## $complete
|
|||
|
## 'dendrogram' with 2 branches and 150 members total, at height 7.085196
|
|||
|
##
|
|||
|
## $average
|
|||
|
## 'dendrogram' with 2 branches and 150 members total, at height 4.062683
|
|||
|
##
|
|||
|
## $mcquitty
|
|||
|
## 'dendrogram' with 2 branches and 150 members total, at height 4.497283
|
|||
|
##
|
|||
|
## $median
|
|||
|
## 'dendrogram' with 2 branches and 150 members total, at height 2.82744
|
|||
|
##
|
|||
|
## $centroid
|
|||
|
## 'dendrogram' with 2 branches and 150 members total, at height 2.994307
|
|||
|
##
|
|||
|
## $ward.D2
|
|||
|
## 'dendrogram' with 2 branches and 150 members total, at height 32.44761
|
|||
|
##
|
|||
|
## attr(,"class")
|
|||
|
## [1] "dendlist"</code></pre>
|
|||
|
<p>Next, we can look at the cophenetic correlation between each
|
|||
|
clustering result using <code>cor.dendlist</code>. (This can be nicely
|
|||
|
plotted using the <code>corrplot</code> function from the
|
|||
|
<em>corrplot</em> package):</p>
|
|||
|
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" tabindex="-1"></a>iris_dendlist_cor <span class="ot"><-</span> <span class="fu">cor.dendlist</span>(iris_dendlist)</span>
|
|||
|
<span id="cb13-2"><a href="#cb13-2" tabindex="-1"></a>iris_dendlist_cor</span></code></pre></div>
|
|||
|
<pre><code>## ward.D single complete average mcquitty median centroid
|
|||
|
## ward.D 1.0000000 0.9836838 0.5774013 0.9841333 0.9641103 0.9451815 0.9809088
|
|||
|
## single 0.9836838 1.0000000 0.5665529 0.9681156 0.9329029 0.9444723 0.9903934
|
|||
|
## complete 0.5774013 0.5665529 1.0000000 0.6195121 0.6107473 0.6889092 0.5870062
|
|||
|
## average 0.9841333 0.9681156 0.6195121 1.0000000 0.9828015 0.9449422 0.9801444
|
|||
|
## mcquitty 0.9641103 0.9329029 0.6107473 0.9828015 1.0000000 0.9203374 0.9499123
|
|||
|
## median 0.9451815 0.9444723 0.6889092 0.9449422 0.9203374 1.0000000 0.9403569
|
|||
|
## centroid 0.9809088 0.9903934 0.5870062 0.9801444 0.9499123 0.9403569 1.0000000
|
|||
|
## ward.D2 0.9911648 0.9682507 0.6096286 0.9895131 0.9829977 0.9445832 0.9737886
|
|||
|
## ward.D2
|
|||
|
## ward.D 0.9911648
|
|||
|
## single 0.9682507
|
|||
|
## complete 0.6096286
|
|||
|
## average 0.9895131
|
|||
|
## mcquitty 0.9829977
|
|||
|
## median 0.9445832
|
|||
|
## centroid 0.9737886
|
|||
|
## ward.D2 1.0000000</code></pre>
|
|||
|
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(iris_dendlist_cor, <span class="st">"pie"</span>, <span class="st">"lower"</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>From the above figure, we can easily see that most clustering methods
|
|||
|
yield very similar results, except for the complete method (the default
|
|||
|
method in <code>hclust</code>), which yields a correlation measure of
|
|||
|
around 0.6.</p>
|
|||
|
<p>The default cophenetic correlation uses Pearson’s measure, but what
|
|||
|
if we use Spearman’s correlation coefficient?</p>
|
|||
|
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a>iris_dendlist_cor_spearman <span class="ot"><-</span> <span class="fu">cor.dendlist</span>(iris_dendlist, <span class="at">method_coef =</span> <span class="st">"spearman"</span>)</span>
|
|||
|
<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(iris_dendlist_cor_spearman, <span class="st">"pie"</span>, <span class="st">"lower"</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We can see that the correlations are not so strong, indicating a
|
|||
|
behavior that is dependent on some items which are very distant from one
|
|||
|
another having an influence on Pearson’s correlation more than that
|
|||
|
of Spearman’s correlation.</p>
|
|||
|
<p>To further explore the similarity and difference between the
|
|||
|
alternative clustering algorithms, we can turn to using the
|
|||
|
<code>tanglegram</code> function (which works for either two
|
|||
|
<code>dendrogram</code>s, or a <code>dendlist</code>).</p>
|
|||
|
<p>First, let us see two methods which are very similar: ward.D vs
|
|||
|
ward.D2. From a first glance, we can see how they both give the same
|
|||
|
result for the top 3 clusters. However, since they are both ladderized
|
|||
|
(i.e.: having their smaller branch rotated to be higher for each node),
|
|||
|
we can see that their clustering is not identical (due to the
|
|||
|
crossings).</p>
|
|||
|
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" tabindex="-1"></a><span class="co"># The `which` parameter allows us to pick the elements in the list to compare</span></span>
|
|||
|
<span id="cb17-2"><a href="#cb17-2" tabindex="-1"></a>iris_dendlist <span class="sc">%>%</span> <span class="fu">dendlist</span>(<span class="at">which =</span> <span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">8</span>)) <span class="sc">%>%</span> ladderize <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb17-3"><a href="#cb17-3" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">"branches_k_color"</span>, <span class="at">k=</span><span class="dv">3</span>) <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb17-4"><a href="#cb17-4" tabindex="-1"></a> <span class="co"># untangle(method = "step1side", k_seq = 3:20) %>%</span></span>
|
|||
|
<span id="cb17-5"><a href="#cb17-5" tabindex="-1"></a> <span class="co"># set("clear_branches") %>% #otherwise the single lines are not black, since they retain the previous color from the branches_k_color.</span></span>
|
|||
|
<span id="cb17-6"><a href="#cb17-6" tabindex="-1"></a> <span class="fu">tanglegram</span>(<span class="at">faster =</span> <span class="cn">TRUE</span>) <span class="co"># (common_subtrees_color_branches = TRUE)</span></span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>Next, let us look at two methods which also have a high cophenetic
|
|||
|
correlation: ward.D vs the average:</p>
|
|||
|
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" tabindex="-1"></a><span class="co"># The `which` parameter allows us to pick the elements in the list to compare</span></span>
|
|||
|
<span id="cb18-2"><a href="#cb18-2" tabindex="-1"></a>iris_dendlist <span class="sc">%>%</span> <span class="fu">dendlist</span>(<span class="at">which =</span> <span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">4</span>)) <span class="sc">%>%</span> ladderize <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb18-3"><a href="#cb18-3" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">"branches_k_color"</span>, <span class="at">k=</span><span class="dv">2</span>) <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb18-4"><a href="#cb18-4" tabindex="-1"></a> <span class="co"># untangle(method = "step1side", k_seq = 3:20) %>%</span></span>
|
|||
|
<span id="cb18-5"><a href="#cb18-5" tabindex="-1"></a> <span class="fu">tanglegram</span>(<span class="at">faster =</span> <span class="cn">TRUE</span>) <span class="co"># (common_subtrees_color_branches = TRUE)</span></span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We see that when it comes to the major clusters, the two algorithms
|
|||
|
perform quite similarly.</p>
|
|||
|
<p>However, how are they doing inside each of the clusters? It is quite
|
|||
|
difficult to compare the two because of the high value in ward.D. For
|
|||
|
comparison purposes, we can “rank” the heights of the branches in the
|
|||
|
two dendrograms (while still preserving their internal order). Next, we
|
|||
|
can highlight the shared common sub-trees (with different colors), and
|
|||
|
the distinct edges (with a dashed line):</p>
|
|||
|
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" tabindex="-1"></a><span class="co"># The `which` parameter allows us to pick the elements in the list to compare</span></span>
|
|||
|
<span id="cb19-2"><a href="#cb19-2" tabindex="-1"></a>iris_dendlist <span class="sc">%>%</span> <span class="fu">dendlist</span>(<span class="at">which =</span> <span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">4</span>)) <span class="sc">%>%</span> ladderize <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb19-3"><a href="#cb19-3" tabindex="-1"></a> <span class="co"># untangle(method = "step1side", k_seq = 3:20) %>%</span></span>
|
|||
|
<span id="cb19-4"><a href="#cb19-4" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">"rank_branches"</span>) <span class="sc">%>%</span></span>
|
|||
|
<span id="cb19-5"><a href="#cb19-5" tabindex="-1"></a> <span class="fu">tanglegram</span>(<span class="at">common_subtrees_color_branches =</span> <span class="cn">TRUE</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We have 39 sub-trees that are identical between the two
|
|||
|
dendrograms:</p>
|
|||
|
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a><span class="fu">length</span>(<span class="fu">unique</span>(<span class="fu">common_subtrees_clusters</span>(iris_dendlist[[<span class="dv">1</span>]], iris_dendlist[[<span class="dv">4</span>]]))[<span class="sc">-</span><span class="dv">1</span>])</span></code></pre></div>
|
|||
|
<pre><code>## [1] 39</code></pre>
|
|||
|
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" tabindex="-1"></a><span class="co"># -1 at the end is because we are ignoring the "0" subtree, which indicates leaves that are singletons.</span></span></code></pre></div>
|
|||
|
<p>What we can learn from this is that actually the two algorithms seem
|
|||
|
to give quite different results in the high resolution (higher cuts).
|
|||
|
However, since both capture the two major clusters (Setosa vs the
|
|||
|
others), they are considered quite similar by the cophenetic
|
|||
|
correlation.</p>
|
|||
|
<p>But what about the “complete” method (that got a lower cophenetic
|
|||
|
correlation than the other methods)? When we compare “complete” vs
|
|||
|
“average”, we can quickly see that in the “complete” method, the
|
|||
|
splitting of the clusters is much more balanced, and mixes the “Setosa”
|
|||
|
species with another one. This is probably the cause for the big
|
|||
|
difference found in the cophenetic correlation between the “complete
|
|||
|
method” and the other clustering methods:</p>
|
|||
|
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" tabindex="-1"></a>iris_dendlist <span class="sc">%>%</span> <span class="fu">dendlist</span>(<span class="at">which =</span> <span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">4</span>)) <span class="sc">%>%</span> ladderize <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb23-2"><a href="#cb23-2" tabindex="-1"></a> <span class="fu">untangle</span>(<span class="at">method =</span> <span class="st">"step1side"</span>, <span class="at">k_seq =</span> <span class="dv">2</span><span class="sc">:</span><span class="dv">6</span>) <span class="sc">%>%</span></span>
|
|||
|
<span id="cb23-3"><a href="#cb23-3" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">"branches_k_color"</span>, <span class="at">k=</span><span class="dv">2</span>) <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb23-4"><a href="#cb23-4" tabindex="-1"></a> <span class="fu">tanglegram</span>(<span class="at">faster =</span> <span class="cn">TRUE</span>) <span class="co"># (common_subtrees_color_branches = TRUE)</span></span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We can quickly plot all 8 methods to see this phenomenon (i.e.: that
|
|||
|
“complete” has its smaller cluster larger than it is in all the other
|
|||
|
clustering methods):</p>
|
|||
|
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow =</span> <span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>))</span>
|
|||
|
<span id="cb24-2"><a href="#cb24-2" tabindex="-1"></a><span class="cf">for</span>(i <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span><span class="dv">8</span>) {</span>
|
|||
|
<span id="cb24-3"><a href="#cb24-3" tabindex="-1"></a> iris_dendlist[[i]] <span class="sc">%>%</span> <span class="fu">set</span>(<span class="st">"branches_k_color"</span>, <span class="at">k=</span><span class="dv">2</span>) <span class="sc">%>%</span> <span class="fu">plot</span>(<span class="at">axes =</span> <span class="cn">FALSE</span>, <span class="at">horiz =</span> <span class="cn">TRUE</span>)</span>
|
|||
|
<span id="cb24-4"><a href="#cb24-4" tabindex="-1"></a> <span class="fu">title</span>(<span class="fu">names</span>(iris_dendlist)[i])</span>
|
|||
|
<span id="cb24-5"><a href="#cb24-5" tabindex="-1"></a>}</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>It seems that the cophenetic correlation is very biased towards the
|
|||
|
influence of the main clusters. Another correlation measure to use is
|
|||
|
the <code>cor_common_nodes</code> correlation (giving the proportion of
|
|||
|
nodes which share the exact same list of labels in both dendrograms). We
|
|||
|
can also check it out:</p>
|
|||
|
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" tabindex="-1"></a>iris_dendlist_cor2 <span class="ot"><-</span> <span class="fu">cor.dendlist</span>(iris_dendlist, <span class="at">method =</span> <span class="st">"common"</span>)</span>
|
|||
|
<span id="cb25-2"><a href="#cb25-2" tabindex="-1"></a>iris_dendlist_cor2</span></code></pre></div>
|
|||
|
<pre><code>## ward.D single complete average mcquitty median centroid
|
|||
|
## ward.D 1.0000000 0.7324415 0.8595318 0.8461538 0.8361204 0.7458194 0.7324415
|
|||
|
## single 0.7324415 1.0000000 0.7324415 0.7491639 0.7458194 0.7591973 0.7625418
|
|||
|
## complete 0.8595318 0.7324415 1.0000000 0.8060201 0.7993311 0.7491639 0.7290970
|
|||
|
## average 0.8461538 0.7491639 0.8060201 1.0000000 0.8494983 0.7892977 0.7725753
|
|||
|
## mcquitty 0.8361204 0.7458194 0.7993311 0.8494983 1.0000000 0.7859532 0.7759197
|
|||
|
## median 0.7458194 0.7591973 0.7491639 0.7892977 0.7859532 1.0000000 0.8528428
|
|||
|
## centroid 0.7324415 0.7625418 0.7290970 0.7725753 0.7759197 0.8528428 1.0000000
|
|||
|
## ward.D2 0.8795987 0.7324415 0.8294314 0.8294314 0.8294314 0.7558528 0.7357860
|
|||
|
## ward.D2
|
|||
|
## ward.D 0.8795987
|
|||
|
## single 0.7324415
|
|||
|
## complete 0.8294314
|
|||
|
## average 0.8294314
|
|||
|
## mcquitty 0.8294314
|
|||
|
## median 0.7558528
|
|||
|
## centroid 0.7357860
|
|||
|
## ward.D2 1.0000000</code></pre>
|
|||
|
<p>And plot it:</p>
|
|||
|
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" tabindex="-1"></a><span class="co"># corrplot::corrplot(iris_dendlist_cor2, "pie", "lower")</span></span></code></pre></div>
|
|||
|
<p>This gives us another perspective on our clustering algorithms. We
|
|||
|
can see that most methods have around 75% common nodes with one another.
|
|||
|
Centroid and median seem relatively close to one another, as well as
|
|||
|
ward.D2 and ward.D to one another and to complete, average, and mcquitty
|
|||
|
(as compared to the other methods).</p>
|
|||
|
</div>
|
|||
|
<div id="clustering-prediction-of-the-3-species-classes" class="section level3">
|
|||
|
<h3>Clustering prediction of the 3 species classes</h3>
|
|||
|
<p>Lastly, we would like to see which of the different clustering
|
|||
|
algorithms came the closest to detecting the 3 flower species (when
|
|||
|
using a cut of k=3).</p>
|
|||
|
<p>For this purpose, we compare the clustering solution of each
|
|||
|
algorithm with the real clusters, using the Fowlkes-Mallows Index (also
|
|||
|
used in the package for the <code>Bk_plot</code>). This measure is
|
|||
|
similar to the Rand (or adjusted Rand) index, and gives a value of 1 when
|
|||
|
the two clusterings conform, and 0 when they do not.</p>
|
|||
|
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" tabindex="-1"></a>get_ordered_3_clusters <span class="ot"><-</span> <span class="cf">function</span>(dend) {</span>
|
|||
|
<span id="cb28-2"><a href="#cb28-2" tabindex="-1"></a> <span class="fu">cutree</span>(dend, <span class="at">k =</span> <span class="dv">3</span>)[<span class="fu">order.dendrogram</span>(dend)]</span>
|
|||
|
<span id="cb28-3"><a href="#cb28-3" tabindex="-1"></a>}</span>
|
|||
|
<span id="cb28-4"><a href="#cb28-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb28-5"><a href="#cb28-5" tabindex="-1"></a>dend_3_clusters <span class="ot"><-</span> <span class="fu">lapply</span>(iris_dendlist, get_ordered_3_clusters)</span>
|
|||
|
<span id="cb28-6"><a href="#cb28-6" tabindex="-1"></a></span>
|
|||
|
<span id="cb28-7"><a href="#cb28-7" tabindex="-1"></a>compare_clusters_to_iris <span class="ot"><-</span> <span class="cf">function</span>(clus) {<span class="fu">FM_index</span>(clus, <span class="fu">rep</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>, <span class="at">each =</span> <span class="dv">50</span>), <span class="at">assume_sorted_vectors =</span> <span class="cn">TRUE</span>)}</span>
|
|||
|
<span id="cb28-8"><a href="#cb28-8" tabindex="-1"></a></span>
|
|||
|
<span id="cb28-9"><a href="#cb28-9" tabindex="-1"></a>clusters_performance <span class="ot"><-</span> <span class="fu">sapply</span>(dend_3_clusters, compare_clusters_to_iris)</span>
|
|||
|
<span id="cb28-10"><a href="#cb28-10" tabindex="-1"></a><span class="fu">dotchart</span>(<span class="fu">sort</span>(clusters_performance), <span class="at">xlim =</span> <span class="fu">c</span>(<span class="fl">0.7</span>,<span class="dv">1</span>),</span>
|
|||
|
<span id="cb28-11"><a href="#cb28-11" tabindex="-1"></a> <span class="at">xlab =</span> <span class="st">"Fowlkes-Mallows Index (from 0 to 1)"</span>,</span>
|
|||
|
<span id="cb28-12"><a href="#cb28-12" tabindex="-1"></a> <span class="at">main =</span> <span class="st">"Perormance of clustering algorithms </span><span class="sc">\n</span><span class="st"> in detecting the 3 species"</span>,</span>
|
|||
|
<span id="cb28-13"><a href="#cb28-13" tabindex="-1"></a> <span class="at">pch =</span> <span class="dv">19</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We can see that the “median” method did the best, although similar
|
|||
|
results were achieved by ward.D2, average, ward.D, and mcquitty.
|
|||
|
However, the complete, centroid, and single methods did worse in our
|
|||
|
case.</p>
|
|||
|
</div>
|
|||
|
<div id="conclusion" class="section level3">
|
|||
|
<h3>Conclusion</h3>
|
|||
|
<p>The Iris data set is only 4-dimensional, making it possible to
|
|||
|
explore using pairs plot (SPLOM) or parallel coordinates plot. It is
|
|||
|
clear from these that two main clusters are visible, while the
|
|||
|
separation of the third cluster is difficult.</p>
|
|||
|
<p>In the above analysis, we learned that the complete method fails to
|
|||
|
do the proper separation of the two main clusters when cut in k=2 (but
|
|||
|
succeeds in doing it, if moving to k=3 clusters). This is different from
|
|||
|
all the other 7 methods available in <code>hclust</code>, which do
|
|||
|
succeed in separating the 2 main clusters from the beginning (i.e.: for
|
|||
|
k=2).</p>
|
|||
|
<p>We also noticed that all clustering algorithms share a relatively
|
|||
|
high proportion of common nodes (between 75% and 90%).</p>
|
|||
|
<p>Lastly, when it came to trying to separate the flowers into 3
|
|||
|
species, the median clustering method did the best, while the single
|
|||
|
method did the worst in this regard.</p>
|
|||
|
<p>While the Iris data set is well known, I hope the above analysis was
|
|||
|
able to offer some new perspectives on the performance of the different
|
|||
|
hierarchical clustering methods.</p>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
<div id="khan---microarray-gene-expression-data-set-from-khan-et-al.-2001.-subset-of-306-genes." class="section level2">
|
|||
|
<h2>khan - Microarray gene expression data set from Khan et al., 2001.
|
|||
|
Subset of 306 genes.</h2>
|
|||
|
<div id="background-1" class="section level3">
|
|||
|
<h3>Background</h3>
|
|||
|
<blockquote>
|
|||
|
<p>Khan contains gene expression profiles of four types of small, round,
|
|||
|
blue cell tumors of childhood (SRBCT) published by Khan et al. (2001).
|
|||
|
It also contains further gene annotation retrieved from SOURCE at <a href="http://source.stanford.edu/" class="uri">http://source.stanford.edu/</a>.</p>
|
|||
|
</blockquote>
|
|||
|
<p>This data set offers two interesting items:</p>
|
|||
|
<ul>
|
|||
|
<li>train: data.frame of 306 rows and 64 columns. The training data set
|
|||
|
of 64 arrays and 306 gene expression values</li>
|
|||
|
<li>test: data.frame, of 306 rows and 25 columns. The test data set of
|
|||
|
25 arrays and 306 gene expression values</li>
|
|||
|
</ul>
|
|||
|
<p>This way we can create a hierarchical clustering on the 306 gene
|
|||
|
expression values on the train and the test data and compare the two to
|
|||
|
see the stability of the results.</p>
|
|||
|
<p>We define the variables:</p>
|
|||
|
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" tabindex="-1"></a>train <span class="ot"><-</span> dendextend<span class="sc">::</span>khan<span class="sc">$</span>train</span>
|
|||
|
<span id="cb29-2"><a href="#cb29-2" tabindex="-1"></a>test <span class="ot"><-</span> dendextend<span class="sc">::</span>khan<span class="sc">$</span>test</span></code></pre></div>
|
|||
|
<p>And create the dendrograms:</p>
|
|||
|
<div class="sourceCode" id="cb30"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" tabindex="-1"></a>d_train <span class="ot"><-</span> train <span class="sc">%>%</span> dist <span class="sc">%>%</span> hclust <span class="sc">%>%</span> as.dendrogram</span>
|
|||
|
<span id="cb30-2"><a href="#cb30-2" tabindex="-1"></a>d_test <span class="ot"><-</span> test <span class="sc">%>%</span> dist <span class="sc">%>%</span> hclust <span class="sc">%>%</span> as.dendrogram</span>
|
|||
|
<span id="cb30-3"><a href="#cb30-3" tabindex="-1"></a>d_train_test <span class="ot"><-</span> <span class="fu">dendlist</span>(<span class="at">train =</span> d_train, <span class="at">test =</span> d_test)</span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="comparing-the-train-vs-test-dendrograms" class="section level3">
|
|||
|
<h3>Comparing the train vs test dendrograms</h3>
|
|||
|
<p>Using a cophenetic correlation, we can see the two trees have some
|
|||
|
similarity (0.57):</p>
|
|||
|
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" tabindex="-1"></a>d_train_test <span class="sc">%>%</span> cor.dendlist</span></code></pre></div>
|
|||
|
<pre><code>## train test
|
|||
|
## train 1.0000000 0.5708019
|
|||
|
## test 0.5708019 1.0000000</code></pre>
|
|||
|
<p>However, when looking at the cophenetic correlation with the Spearman
|
|||
|
correlation coefficient, the value is lower (0.50) indicating that some
|
|||
|
of the similarity is due to a small number of items, distant from the
|
|||
|
others, which are correlated similarly in the two trees:</p>
|
|||
|
<div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" tabindex="-1"></a>d_train_test <span class="sc">%>%</span> <span class="fu">cor.dendlist</span>(<span class="at">method_coef =</span> <span class="st">"spearman"</span>)</span></code></pre></div>
|
|||
|
<pre><code>## train test
|
|||
|
## train 1.0000000 0.4971936
|
|||
|
## test 0.4971936 1.0000000</code></pre>
|
|||
|
<p>We may ask at which level of cutting the dendrogram we get the “best”
|
|||
|
level of similarity. For this we may turn to the Bk plot. The plot
|
|||
|
shows us that at around 7 clusters the groups in the two are starting to
|
|||
|
look significantly similar. (Note that significantly does not mean
|
|||
|
substantially)</p>
|
|||
|
<div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" tabindex="-1"></a><span class="fu">Bk_plot</span>(d_train, d_test, <span class="at">k =</span> <span class="dv">2</span><span class="sc">:</span><span class="dv">30</span>, <span class="at">xlim =</span> <span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">30</span>))</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>Next, we compare the results with a tanglegram. We make sure to color
|
|||
|
the connecting line with the colors of the branches of the train (left)
|
|||
|
dendrogram. This can help us see which patterns are somewhat preserved
|
|||
|
between the two trees.</p>
|
|||
|
<div class="sourceCode" id="cb36"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" tabindex="-1"></a>pre_tang_d_train_test <span class="ot"><-</span> d_train_test <span class="sc">%>%</span> ladderize <span class="sc">%>%</span> <span class="co"># untangle %>%</span></span>
|
|||
|
<span id="cb36-2"><a href="#cb36-2" tabindex="-1"></a> <span class="fu">set</span>(<span class="st">"branches_k_color"</span>, <span class="at">k =</span> <span class="dv">7</span>)</span>
|
|||
|
<span id="cb36-3"><a href="#cb36-3" tabindex="-1"></a>train_branches_colors <span class="ot"><-</span> <span class="fu">get_leaves_branches_col</span>(pre_tang_d_train_test<span class="sc">$</span>train)</span>
|
|||
|
<span id="cb36-4"><a href="#cb36-4" tabindex="-1"></a>pre_tang_d_train_test <span class="sc">%>%</span> <span class="fu">tanglegram</span>(<span class="at">fast =</span> <span class="cn">TRUE</span>, <span class="at">color_lines =</span> train_branches_colors)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We can see that the top most (small) cluster is somewhat preserved
|
|||
|
between the two trees. However, a large spaghetti-like tangle of lines
|
|||
|
is indicating that the two trees are far from being identical.</p>
|
|||
|
<p>If we look only at subtrees of the two dendrograms so that they
|
|||
|
include only genes that are clustered with genes in both trees, we get
|
|||
|
only 14 genes (while the original trees had 306 genes). We can see how
|
|||
|
we have several groups of pairs of genes, and one group with four genes
|
|||
|
clustered together in both trees:</p>
|
|||
|
<div class="sourceCode" id="cb37"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1" tabindex="-1"></a><span class="co"># This was calculated before</span></span>
|
|||
|
<span id="cb37-2"><a href="#cb37-2" tabindex="-1"></a><span class="co"># d_train_test_common <- d_train_test %>% prune_common_subtrees.dendlist</span></span>
|
|||
|
<span id="cb37-3"><a href="#cb37-3" tabindex="-1"></a><span class="co"># d_train_test_common</span></span>
|
|||
|
<span id="cb37-4"><a href="#cb37-4" tabindex="-1"></a>d_train_test_common <span class="sc">%>%</span> untangle <span class="sc">%>%</span> <span class="fu">tanglegram</span>(<span class="at">common_subtrees_color_branches =</span> <span class="cn">TRUE</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>Trees’ sizes:</p>
|
|||
|
<div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" tabindex="-1"></a>d_train_test <span class="sc">%>%</span> nleaves</span></code></pre></div>
|
|||
|
<pre><code>## train test
|
|||
|
## 306 306</code></pre>
|
|||
|
<div class="sourceCode" id="cb40"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1" tabindex="-1"></a>d_train_test_common <span class="sc">%>%</span> nleaves</span></code></pre></div>
|
|||
|
<pre><code>## train test
|
|||
|
## 14 14</code></pre>
|
|||
|
</div>
|
|||
|
<div id="conclusion-1" class="section level3">
|
|||
|
<h3>Conclusion</h3>
|
|||
|
<p>To conclude: we see that the clustering algorithm resulted in trees
|
|||
|
which are significantly similar in both the training and the test data
|
|||
|
sets beyond chance, but that this similarity is restricted to only a
|
|||
|
very small proportion of genes.</p>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
<div id="votes.repub---votes-for-republican-candidate-in-presidential-elections" class="section level2">
|
|||
|
<h2>votes.repub - Votes for Republican Candidate in Presidential
|
|||
|
Elections</h2>
|
|||
|
<div id="background-2" class="section level3">
|
|||
|
<h3>Background</h3>
|
|||
|
<blockquote>
|
|||
|
<p>This is a data frame with the percentage of votes given to the
|
|||
|
republican candidate in presidential elections from 1856 to 1976. Rows
|
|||
|
represent the 50 states, and columns the 31 elections.</p>
|
|||
|
</blockquote>
|
|||
|
<blockquote>
|
|||
|
<p>Source: S. Peterson (1973): A Statistical History of the American
|
|||
|
Presidential Elections. New York: Frederick Ungar Publishing Co. Data
|
|||
|
from 1964 to 1976 is from R. M. Scammon, American Votes 12,
|
|||
|
Congressional Quarterly.</p>
|
|||
|
</blockquote>
|
|||
|
<p>Define variables:</p>
|
|||
|
<div class="sourceCode" id="cb42"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1" tabindex="-1"></a>votes.repub <span class="ot"><-</span> cluster<span class="sc">::</span>votes.repub</span></code></pre></div>
|
|||
|
<p>These data can be visualized using a (custom-made) parallel
|
|||
|
coordinates plot:</p>
|
|||
|
<div class="sourceCode" id="cb43"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb43-1"><a href="#cb43-1" tabindex="-1"></a>years <span class="ot"><-</span> <span class="fu">as.numeric</span>(<span class="fu">gsub</span>(<span class="st">"X"</span>, <span class="st">""</span>, <span class="fu">colnames</span>(votes.repub)))</span>
|
|||
|
<span id="cb43-2"><a href="#cb43-2" tabindex="-1"></a></span>
|
|||
|
<span id="cb43-3"><a href="#cb43-3" tabindex="-1"></a><span class="fu">par</span>(<span class="at">las =</span> <span class="dv">2</span>, <span class="at">mar =</span> <span class="fu">c</span>(<span class="fl">4.5</span>, <span class="dv">3</span>, <span class="dv">3</span>, <span class="dv">2</span>) <span class="sc">+</span> <span class="fl">0.1</span>, <span class="at">cex =</span> .<span class="dv">8</span>)</span>
|
|||
|
<span id="cb43-4"><a href="#cb43-4" tabindex="-1"></a><span class="co"># MASS::parcoord(votes.repub, var.label = FALSE, lwd = 1)</span></span>
|
|||
|
<span id="cb43-5"><a href="#cb43-5" tabindex="-1"></a><span class="fu">matplot</span>(<span class="dv">1</span><span class="dt">L</span><span class="sc">:</span><span class="fu">ncol</span>(votes.repub), <span class="fu">t</span>(votes.repub), <span class="at">type =</span> <span class="st">"l"</span>, <span class="at">col =</span> <span class="dv">1</span>, <span class="at">lty =</span> <span class="dv">1</span>,</span>
|
|||
|
<span id="cb43-6"><a href="#cb43-6" tabindex="-1"></a> <span class="at">axes =</span> F, <span class="at">xlab =</span> <span class="st">""</span>, <span class="at">ylab =</span> <span class="st">""</span>)</span>
|
|||
|
<span id="cb43-7"><a href="#cb43-7" tabindex="-1"></a><span class="fu">axis</span>(<span class="dv">1</span>, <span class="at">at =</span> <span class="fu">seq_along</span>(years), <span class="at">labels =</span> years)</span>
|
|||
|
<span id="cb43-8"><a href="#cb43-8" tabindex="-1"></a><span class="fu">axis</span>(<span class="dv">2</span>)</span>
|
|||
|
<span id="cb43-9"><a href="#cb43-9" tabindex="-1"></a><span class="co"># Add Title</span></span>
|
|||
|
<span id="cb43-10"><a href="#cb43-10" tabindex="-1"></a><span class="fu">title</span>(<span class="st">"Votes for Republican Candidate</span><span class="sc">\n</span><span class="st"> in Presidential Elections </span><span class="sc">\n</span><span class="st"> (each line is a country - over the years)"</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
</div>
|
|||
|
<div id="heatmap" class="section level3">
|
|||
|
<h3>Heatmap</h3>
|
|||
|
<p>This is a nice example of a case where the parallel coordinates plot has some
|
|||
|
serious limitations: it does not help us detect the states, we fail to
|
|||
|
see the missing value patterns, and it is tricky to see clusters in
|
|||
|
general (due to the large number of threads).</p>
|
|||
|
<p>For these data, it can be quite helpful to see a heatmap of the votes
|
|||
|
across the years. The ordering of the rows is tricky. First, the
|
|||
|
distance between the vectors (later used for the clustering) should be computed
|
|||
|
after transformation (since we are dealing with proportion of votes). In
|
|||
|
this case, I used the arcsin transformation (a logit transformation
|
|||
|
could also work, but the arcsin is safer for dealing with 0/1
|
|||
|
observations). But given the clusters, we wish to order the leaves (as
|
|||
|
much as possible), in order to take into account the missing value
|
|||
|
clusterings. So we, in fact, have two clusterings, one for the raw values,
|
|||
|
and another for the “shadow matrix” (i.e.: the matrix with 0/1,
|
|||
|
indicating if a value was missing or not).</p>
|
|||
|
<div class="sourceCode" id="cb44"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb44-1"><a href="#cb44-1" tabindex="-1"></a>arcsin_transformation <span class="ot"><-</span> <span class="cf">function</span>(x) <span class="fu">asin</span>(x<span class="sc">/</span><span class="dv">100</span>)</span>
|
|||
|
<span id="cb44-2"><a href="#cb44-2" tabindex="-1"></a></span>
|
|||
|
<span id="cb44-3"><a href="#cb44-3" tabindex="-1"></a>dend_NA <span class="ot"><-</span> votes.repub <span class="sc">%>%</span> is.na <span class="sc">%>%</span></span>
|
|||
|
<span id="cb44-4"><a href="#cb44-4" tabindex="-1"></a> dist <span class="sc">%>%</span> hclust <span class="sc">%>%</span> as.dendrogram <span class="sc">%>%</span> ladderize</span>
|
|||
|
<span id="cb44-5"><a href="#cb44-5" tabindex="-1"></a></span>
|
|||
|
<span id="cb44-6"><a href="#cb44-6" tabindex="-1"></a>dend <span class="ot"><-</span> votes.repub <span class="sc">%>%</span> arcsin_transformation <span class="sc">%>%</span></span>
|
|||
|
<span id="cb44-7"><a href="#cb44-7" tabindex="-1"></a> dist <span class="sc">%>%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">"com"</span>) <span class="sc">%>%</span> as.dendrogram <span class="sc">%>%</span></span>
|
|||
|
<span id="cb44-8"><a href="#cb44-8" tabindex="-1"></a> <span class="fu">rotate</span>(<span class="fu">labels</span>(dend_NA)) <span class="sc">%>%</span></span>
|
|||
|
<span id="cb44-9"><a href="#cb44-9" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">3</span>)</span>
|
|||
|
<span id="cb44-10"><a href="#cb44-10" tabindex="-1"></a></span>
|
|||
|
<span id="cb44-11"><a href="#cb44-11" tabindex="-1"></a><span class="co"># some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))</span></span>
|
|||
|
<span id="cb44-12"><a href="#cb44-12" tabindex="-1"></a>some_col_func <span class="ot"><-</span> colorspace<span class="sc">::</span>diverge_hcl</span>
|
|||
|
<span id="cb44-13"><a href="#cb44-13" tabindex="-1"></a></span>
|
|||
|
<span id="cb44-14"><a href="#cb44-14" tabindex="-1"></a></span>
|
|||
|
<span id="cb44-15"><a href="#cb44-15" tabindex="-1"></a><span class="co"># par(mar = c(3,3,3,3))</span></span>
|
|||
|
<span id="cb44-16"><a href="#cb44-16" tabindex="-1"></a><span class="co"># library(gplots)</span></span>
|
|||
|
<span id="cb44-17"><a href="#cb44-17" tabindex="-1"></a>gplots<span class="sc">::</span><span class="fu">heatmap.2</span>(<span class="fu">as.matrix</span>(votes.repub), </span>
|
|||
|
<span id="cb44-18"><a href="#cb44-18" tabindex="-1"></a> <span class="at">main =</span> <span class="st">"Votes for</span><span class="sc">\n</span><span class="st"> Republican Presidential Candidate</span><span class="sc">\n</span><span class="st"> (clustered using complete)"</span>,</span>
|
|||
|
<span id="cb44-19"><a href="#cb44-19" tabindex="-1"></a> <span class="at">srtCol =</span> <span class="dv">60</span>,</span>
|
|||
|
<span id="cb44-20"><a href="#cb44-20" tabindex="-1"></a> <span class="at">dendrogram =</span> <span class="st">"row"</span>,</span>
|
|||
|
<span id="cb44-21"><a href="#cb44-21" tabindex="-1"></a> <span class="at">Rowv =</span> dend,</span>
|
|||
|
<span id="cb44-22"><a href="#cb44-22" tabindex="-1"></a> <span class="at">Colv =</span> <span class="st">"NA"</span>, <span class="co"># this to make sure the columns are not ordered</span></span>
|
|||
|
<span id="cb44-23"><a href="#cb44-23" tabindex="-1"></a> <span class="at">trace=</span><span class="st">"none"</span>, </span>
|
|||
|
<span id="cb44-24"><a href="#cb44-24" tabindex="-1"></a> <span class="at">margins =</span><span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">6</span>), </span>
|
|||
|
<span id="cb44-25"><a href="#cb44-25" tabindex="-1"></a> <span class="at">key.xlab =</span> <span class="st">"% Votes for Republican</span><span class="sc">\n</span><span class="st"> Presidential Candidate"</span>,</span>
|
|||
|
<span id="cb44-26"><a href="#cb44-26" tabindex="-1"></a> <span class="at">labCol =</span> years,</span>
|
|||
|
<span id="cb44-27"><a href="#cb44-27" tabindex="-1"></a> <span class="at">denscol =</span> <span class="st">"grey"</span>,</span>
|
|||
|
<span id="cb44-28"><a href="#cb44-28" tabindex="-1"></a> <span class="at">density.info =</span> <span class="st">"density"</span>,</span>
|
|||
|
<span id="cb44-29"><a href="#cb44-29" tabindex="-1"></a> <span class="at">col =</span> some_col_func</span>
|
|||
|
<span id="cb44-30"><a href="#cb44-30" tabindex="-1"></a> )</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<div class="sourceCode" id="cb45"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb45-1"><a href="#cb45-1" tabindex="-1"></a> <span class="co"># RowSideColors = rev(labels_colors(dend)), # to add nice colored strips </span></span></code></pre></div>
|
|||
|
<p>How much of a difference would we get if we used another clustering
|
|||
|
algorithm?</p>
|
|||
|
<p>We first calculate the clustering using 8 different methods:</p>
|
|||
|
<div class="sourceCode" id="cb46"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb46-1"><a href="#cb46-1" tabindex="-1"></a>hclust_methods <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"ward.D"</span>, <span class="st">"single"</span>, <span class="st">"complete"</span>, <span class="st">"average"</span>, <span class="st">"mcquitty"</span>, </span>
|
|||
|
<span id="cb46-2"><a href="#cb46-2" tabindex="-1"></a> <span class="st">"median"</span>, <span class="st">"centroid"</span>, <span class="st">"ward.D2"</span>)</span>
|
|||
|
<span id="cb46-3"><a href="#cb46-3" tabindex="-1"></a>votes.repub_dendlist <span class="ot"><-</span> <span class="fu">dendlist</span>()</span>
|
|||
|
<span id="cb46-4"><a href="#cb46-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb46-5"><a href="#cb46-5" tabindex="-1"></a><span class="cf">for</span>(i <span class="cf">in</span> <span class="fu">seq_along</span>(hclust_methods)) {</span>
|
|||
|
<span id="cb46-6"><a href="#cb46-6" tabindex="-1"></a> tmp_dend <span class="ot"><-</span> votes.repub <span class="sc">%>%</span> arcsin_transformation <span class="sc">%>%</span> dist <span class="sc">%>%</span> <span class="fu">hclust</span>(<span class="at">method =</span> hclust_methods[i]) <span class="sc">%>%</span> as.dendrogram </span>
|
|||
|
<span id="cb46-7"><a href="#cb46-7" tabindex="-1"></a> votes.repub_dendlist <span class="ot"><-</span> <span class="fu">dendlist</span>(votes.repub_dendlist, tmp_dend)</span>
|
|||
|
<span id="cb46-8"><a href="#cb46-8" tabindex="-1"></a>}</span>
|
|||
|
<span id="cb46-9"><a href="#cb46-9" tabindex="-1"></a><span class="fu">names</span>(votes.repub_dendlist) <span class="ot"><-</span> hclust_methods</span>
|
|||
|
<span id="cb46-10"><a href="#cb46-10" tabindex="-1"></a><span class="co"># votes.repub_dendlist</span></span></code></pre></div>
|
|||
|
<p>Next, we can look at the cophenetic correlation between each
|
|||
|
clustering result using <code>cor.dendlist</code>. (This can be nicely
|
|||
|
plotted using the <code>corrplot</code> function from the
|
|||
|
<em>corrplot</em> package):</p>
|
|||
|
<div class="sourceCode" id="cb47"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb47-1"><a href="#cb47-1" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(<span class="fu">cor.dendlist</span>(votes.repub_dendlist), <span class="st">"pie"</span>, <span class="st">"lower"</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We see that the “complete” method is somewhat similar to the
|
|||
|
ward.D/ward.D2 methods, but there is less similarity with the other
|
|||
|
methods. We can see that the methods “average”, “mcquitty” and “median”,
|
|||
|
all give somewhat similar results. So by using “average”, we will see an
|
|||
|
alternative presentation that represents (in a sense) three other
|
|||
|
clustering solutions.</p>
|
|||
|
<p>We can look at the heatmap of the “average” method. However, as you
|
|||
|
can see, it is not very helpful in seeing the difference between the two
|
|||
|
clustering solutions.</p>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>Let’s look at the tanglegram of the two methods to get a better
|
|||
|
insight into the differences between the two:</p>
|
|||
|
<div class="sourceCode" id="cb48"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1" tabindex="-1"></a>dend_com <span class="ot"><-</span> votes.repub <span class="sc">%>%</span> arcsin_transformation <span class="sc">%>%</span></span>
|
|||
|
<span id="cb48-2"><a href="#cb48-2" tabindex="-1"></a> dist <span class="sc">%>%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">"com"</span>) <span class="sc">%>%</span> as.dendrogram <span class="sc">%>%</span></span>
|
|||
|
<span id="cb48-3"><a href="#cb48-3" tabindex="-1"></a> <span class="fu">rotate</span>(<span class="fu">labels</span>(dend_NA)) <span class="sc">%>%</span></span>
|
|||
|
<span id="cb48-4"><a href="#cb48-4" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">3</span>) <span class="co"># %>% ladderize</span></span>
|
|||
|
<span id="cb48-5"><a href="#cb48-5" tabindex="-1"></a>dend_ave <span class="ot"><-</span> votes.repub <span class="sc">%>%</span> arcsin_transformation <span class="sc">%>%</span></span>
|
|||
|
<span id="cb48-6"><a href="#cb48-6" tabindex="-1"></a> dist <span class="sc">%>%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">"ave"</span>) <span class="sc">%>%</span> as.dendrogram <span class="sc">%>%</span></span>
|
|||
|
<span id="cb48-7"><a href="#cb48-7" tabindex="-1"></a> <span class="fu">rotate</span>(<span class="fu">labels</span>(dend_NA)) <span class="sc">%>%</span></span>
|
|||
|
<span id="cb48-8"><a href="#cb48-8" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">3</span>) <span class="co"># %>% ladderize</span></span>
|
|||
|
<span id="cb48-9"><a href="#cb48-9" tabindex="-1"></a></span>
|
|||
|
<span id="cb48-10"><a href="#cb48-10" tabindex="-1"></a><span class="co"># The orders were predefined after using untangle("step2side")</span></span>
|
|||
|
<span id="cb48-11"><a href="#cb48-11" tabindex="-1"></a><span class="co"># They are omitted here to save running time.</span></span>
|
|||
|
<span id="cb48-12"><a href="#cb48-12" tabindex="-1"></a>dend_com <span class="ot"><-</span> <span class="fu">rotate</span>(dend_com, ord1)</span>
|
|||
|
<span id="cb48-13"><a href="#cb48-13" tabindex="-1"></a>dend_ave <span class="ot"><-</span> <span class="fu">rotate</span>(dend_ave, ord2)</span>
|
|||
|
<span id="cb48-14"><a href="#cb48-14" tabindex="-1"></a></span>
|
|||
|
<span id="cb48-15"><a href="#cb48-15" tabindex="-1"></a>dends <span class="ot"><-</span> <span class="fu">dendlist</span>(<span class="at">complete =</span> dend_com, <span class="at">average =</span> dend_ave) <span class="co"># %>% untangle("step2side")</span></span>
|
|||
|
<span id="cb48-16"><a href="#cb48-16" tabindex="-1"></a>dends <span class="sc">%>%</span> <span class="fu">tanglegram</span>(<span class="at">margin_inner =</span> <span class="dv">7</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We see that the two clusterings give similar results for: “Alabama”,
|
|||
|
“Georgia”, “Louisiana”, “Arkansas”, “Florida”, “Texas”, “South
|
|||
|
Carolina”, “Mississippi”.</p>
|
|||
|
<p>There are also several other sub-trees which are identical between
|
|||
|
the two methods. The biggest difference lies in several “rogue” states
|
|||
|
that are placed differently in the two clustering algorithms. They are:
|
|||
|
Vermont, Michigan, Maine, Hawaii, New Jersey, West Virginia, and
|
|||
|
Oklahoma.</p>
|
|||
|
<p>A better understanding of the data requires a much more in-depth
|
|||
|
historical perspective than is within the scope of the current
|
|||
|
analysis.</p>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
<div id="animals---attributes-of-animals" class="section level2">
|
|||
|
<h2>animals - Attributes of Animals</h2>
|
|||
|
<div id="background-3" class="section level3">
|
|||
|
<h3>Background</h3>
|
|||
|
<blockquote>
|
|||
|
<p>This data set considers 6 binary attributes for 20 animals.</p>
|
|||
|
</blockquote>
|
|||
|
<blockquote>
|
|||
|
<p>see Struyf, Hubert & Rousseeuw (1996), in agnes.</p>
|
|||
|
</blockquote>
|
|||
|
<p>Define variables:</p>
|
|||
|
<div class="sourceCode" id="cb49"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb49-1"><a href="#cb49-1" tabindex="-1"></a>animals <span class="ot"><-</span> cluster<span class="sc">::</span>animals</span>
|
|||
|
<span id="cb49-2"><a href="#cb49-2" tabindex="-1"></a></span>
|
|||
|
<span id="cb49-3"><a href="#cb49-3" tabindex="-1"></a><span class="fu">colnames</span>(animals) <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"warm-blooded"</span>, </span>
|
|||
|
<span id="cb49-4"><a href="#cb49-4" tabindex="-1"></a> <span class="st">"can fly"</span>,</span>
|
|||
|
<span id="cb49-5"><a href="#cb49-5" tabindex="-1"></a> <span class="st">"vertebrate"</span>,</span>
|
|||
|
<span id="cb49-6"><a href="#cb49-6" tabindex="-1"></a> <span class="st">"endangered"</span>,</span>
|
|||
|
<span id="cb49-7"><a href="#cb49-7" tabindex="-1"></a> <span class="st">"live in groups"</span>,</span>
|
|||
|
<span id="cb49-8"><a href="#cb49-8" tabindex="-1"></a> <span class="st">"have hair"</span>)</span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="heatmap-1" class="section level3">
|
|||
|
<h3>Heatmap</h3>
|
|||
|
<p>This is a good example for using a heatmap + colored branches.</p>
|
|||
|
<div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb50-1"><a href="#cb50-1" tabindex="-1"></a>dend_r <span class="ot"><-</span> animals <span class="sc">%>%</span> <span class="fu">dist</span>(<span class="at">method =</span> <span class="st">"man"</span>) <span class="sc">%>%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">"ward.D"</span>) <span class="sc">%>%</span> as.dendrogram <span class="sc">%>%</span> ladderize <span class="sc">%>%</span></span>
|
|||
|
<span id="cb50-2"><a href="#cb50-2" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">4</span>)</span>
|
|||
|
<span id="cb50-3"><a href="#cb50-3" tabindex="-1"></a></span>
|
|||
|
<span id="cb50-4"><a href="#cb50-4" tabindex="-1"></a>dend_c <span class="ot"><-</span> <span class="fu">t</span>(animals) <span class="sc">%>%</span> <span class="fu">dist</span>(<span class="at">method =</span> <span class="st">"man"</span>) <span class="sc">%>%</span> <span class="fu">hclust</span>(<span class="at">method =</span> <span class="st">"com"</span>) <span class="sc">%>%</span> as.dendrogram <span class="sc">%>%</span> ladderize<span class="sc">%>%</span></span>
|
|||
|
<span id="cb50-5"><a href="#cb50-5" tabindex="-1"></a> <span class="fu">color_branches</span>(<span class="at">k=</span><span class="dv">3</span>)</span>
|
|||
|
<span id="cb50-6"><a href="#cb50-6" tabindex="-1"></a></span>
|
|||
|
<span id="cb50-7"><a href="#cb50-7" tabindex="-1"></a></span>
|
|||
|
<span id="cb50-8"><a href="#cb50-8" tabindex="-1"></a><span class="co"># some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))</span></span>
|
|||
|
<span id="cb50-9"><a href="#cb50-9" tabindex="-1"></a><span class="co"># some_col_func <- colorspace::diverge_hcl</span></span>
|
|||
|
<span id="cb50-10"><a href="#cb50-10" tabindex="-1"></a><span class="co"># some_col_func <- colorspace::sequential_hcl</span></span>
|
|||
|
<span id="cb50-11"><a href="#cb50-11" tabindex="-1"></a>some_col_func <span class="ot"><-</span> <span class="cf">function</span>(n) (colorspace<span class="sc">::</span><span class="fu">diverge_hcl</span>(n, <span class="at">h =</span> <span class="fu">c</span>(<span class="dv">246</span>, <span class="dv">40</span>), <span class="at">c =</span> <span class="dv">96</span>, <span class="at">l =</span> <span class="fu">c</span>(<span class="dv">65</span>, <span class="dv">90</span>)))</span>
|
|||
|
<span id="cb50-12"><a href="#cb50-12" tabindex="-1"></a></span>
|
|||
|
<span id="cb50-13"><a href="#cb50-13" tabindex="-1"></a></span>
|
|||
|
<span id="cb50-14"><a href="#cb50-14" tabindex="-1"></a></span>
|
|||
|
<span id="cb50-15"><a href="#cb50-15" tabindex="-1"></a><span class="co"># par(mar = c(3,3,3,3))</span></span>
|
|||
|
<span id="cb50-16"><a href="#cb50-16" tabindex="-1"></a><span class="co"># library(gplots)</span></span>
|
|||
|
<span id="cb50-17"><a href="#cb50-17" tabindex="-1"></a>gplots<span class="sc">::</span><span class="fu">heatmap.2</span>(<span class="fu">as.matrix</span>(animals<span class="dv">-1</span>), </span>
|
|||
|
<span id="cb50-18"><a href="#cb50-18" tabindex="-1"></a> <span class="at">main =</span> <span class="st">"Attributes of Animals"</span>,</span>
|
|||
|
<span id="cb50-19"><a href="#cb50-19" tabindex="-1"></a> <span class="at">srtCol =</span> <span class="dv">35</span>,</span>
|
|||
|
<span id="cb50-20"><a href="#cb50-20" tabindex="-1"></a> <span class="at">Rowv =</span> dend_r,</span>
|
|||
|
<span id="cb50-21"><a href="#cb50-21" tabindex="-1"></a> <span class="at">Colv =</span> dend_c,</span>
|
|||
|
<span id="cb50-22"><a href="#cb50-22" tabindex="-1"></a> <span class="at">trace=</span><span class="st">"row"</span>, <span class="at">hline =</span> <span class="cn">NA</span>, <span class="at">tracecol =</span> <span class="st">"darkgrey"</span>, </span>
|
|||
|
<span id="cb50-23"><a href="#cb50-23" tabindex="-1"></a> <span class="at">margins =</span><span class="fu">c</span>(<span class="dv">6</span>,<span class="dv">3</span>), </span>
|
|||
|
<span id="cb50-24"><a href="#cb50-24" tabindex="-1"></a> <span class="at">key.xlab =</span> <span class="st">"no / yes"</span>,</span>
|
|||
|
<span id="cb50-25"><a href="#cb50-25" tabindex="-1"></a> <span class="at">denscol =</span> <span class="st">"grey"</span>,</span>
|
|||
|
<span id="cb50-26"><a href="#cb50-26" tabindex="-1"></a> <span class="at">density.info =</span> <span class="st">"density"</span>,</span>
|
|||
|
<span id="cb50-27"><a href="#cb50-27" tabindex="-1"></a> <span class="at">col =</span> some_col_func</span>
|
|||
|
<span id="cb50-28"><a href="#cb50-28" tabindex="-1"></a> )</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We see that we have several groups of variables: the “can fly” and
|
|||
|
“endangered” (which usually are both “no”), the “have hair”, and the
|
|||
|
“warm-blooded”, “vertebrate”, and “live in groups”.</p>
|
|||
|
<p>We see that within the animals there are (roughly!) the following 4
|
|||
|
groups:</p>
|
|||
|
<ol style="list-style-type: decimal">
|
|||
|
<li>The cold-blooded non-vertebrates, which are mostly not
|
|||
|
endangered.</li>
|
|||
|
<li>The warm-blooded vertebrates, which live in groups, have hair,
|
|||
|
cannot fly, and mostly are not endangered.</li>
|
|||
|
<li>The cold-blooded vertebrates, without hair, cannot fly, and are not
|
|||
|
endangered.</li>
|
|||
|
<li>The (mostly) warm-blooded vertebrates, without hair, some can fly,
|
|||
|
and some are endangered.</li>
|
|||
|
</ol>
|
|||
|
<p>How much of a difference would we get if we used another clustering
|
|||
|
algorithm?</p>
|
|||
|
<p>We first calculate the clustering using 8 different methods:</p>
|
|||
|
<div class="sourceCode" id="cb51"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb51-1"><a href="#cb51-1" tabindex="-1"></a>hclust_methods <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"ward.D"</span>, <span class="st">"single"</span>, <span class="st">"complete"</span>, <span class="st">"average"</span>, <span class="st">"mcquitty"</span>, </span>
|
|||
|
<span id="cb51-2"><a href="#cb51-2" tabindex="-1"></a> <span class="st">"median"</span>, <span class="st">"centroid"</span>, <span class="st">"ward.D2"</span>)</span>
|
|||
|
<span id="cb51-3"><a href="#cb51-3" tabindex="-1"></a>animals_dendlist <span class="ot"><-</span> <span class="fu">dendlist</span>()</span>
|
|||
|
<span id="cb51-4"><a href="#cb51-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb51-5"><a href="#cb51-5" tabindex="-1"></a><span class="cf">for</span>(i <span class="cf">in</span> <span class="fu">seq_along</span>(hclust_methods)) {</span>
|
|||
|
<span id="cb51-6"><a href="#cb51-6" tabindex="-1"></a> tmp_dend <span class="ot"><-</span> animals <span class="sc">%>%</span> <span class="fu">dist</span>(<span class="at">method =</span> <span class="st">"man"</span>) <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb51-7"><a href="#cb51-7" tabindex="-1"></a> <span class="fu">hclust</span>(<span class="at">method =</span> hclust_methods[i]) <span class="sc">%>%</span> as.dendrogram </span>
|
|||
|
<span id="cb51-8"><a href="#cb51-8" tabindex="-1"></a> animals_dendlist <span class="ot"><-</span> <span class="fu">dendlist</span>(animals_dendlist, tmp_dend)</span>
|
|||
|
<span id="cb51-9"><a href="#cb51-9" tabindex="-1"></a>}</span>
|
|||
|
<span id="cb51-10"><a href="#cb51-10" tabindex="-1"></a><span class="fu">names</span>(animals_dendlist) <span class="ot"><-</span> hclust_methods</span>
|
|||
|
<span id="cb51-11"><a href="#cb51-11" tabindex="-1"></a><span class="co"># votes.repub_dendlist</span></span></code></pre></div>
|
|||
|
<p>Next, we can look at the cophenetic correlation between each
|
|||
|
clustering result using <code>cor.dendlist</code>. (This can be nicely
|
|||
|
plotted using the <code>corrplot</code> function from the
|
|||
|
<em>corrplot</em> package):</p>
|
|||
|
<div class="sourceCode" id="cb52"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1" tabindex="-1"></a>cophenetic_cors <span class="ot"><-</span> <span class="fu">cor.dendlist</span>(animals_dendlist)</span>
|
|||
|
<span id="cb52-2"><a href="#cb52-2" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(cophenetic_cors, <span class="st">"pie"</span>, <span class="st">"lower"</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We see that the different methods (other than ward.D and ward.D2),
|
|||
|
all give quite different results. So would the above analysis be
|
|||
|
different if we had used another clustering algorithm?</p>
|
|||
|
<p>For this purpose, we compare the clustering solution of each
|
|||
|
algorithm with one another, when cut to k=4 clusters, using the
|
|||
|
Fowlkes-Mallows Index. This measure is similar to the Rand (or Rand
|
|||
|
adjusted) index, and gives a value of 1 when the two clusters conform,
|
|||
|
and 0 when they do not:</p>
|
|||
|
<div class="sourceCode" id="cb53"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb53-1"><a href="#cb53-1" tabindex="-1"></a>remove_median <span class="ot"><-</span> <span class="fu">dendlist</span>(animals_dendlist, <span class="at">which =</span> <span class="fu">c</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">8</span>)[<span class="sc">-</span><span class="dv">6</span>] )</span>
|
|||
|
<span id="cb53-2"><a href="#cb53-2" tabindex="-1"></a>FM_cors <span class="ot"><-</span> <span class="fu">cor.dendlist</span>(remove_median, <span class="at">method =</span> <span class="st">"FM_index"</span>, <span class="at">k =</span> <span class="dv">4</span>)</span>
|
|||
|
<span id="cb53-3"><a href="#cb53-3" tabindex="-1"></a>corrplot<span class="sc">::</span><span class="fu">corrplot</span>(FM_cors, <span class="st">"pie"</span>, <span class="st">"lower"</span>)</span></code></pre></div>
|
|||
|
<p><img role="img" src="
|
|||
|
<p>We removed the “median” method since cutting its tree into k=4 clusters was not possible. In
|
|||
|
general, the results seem sensitive to the algorithm used, and the
|
|||
|
different algorithm methods do not seem to agree with one another (with
|
|||
|
regard to k=4), so further analyses may be needed in order to decide
|
|||
|
on which algorithm and interpretation are most appropriate for these
|
|||
|
data.</p>
|
|||
|
<p>(Other possible data sets for the future: chorSub, flower,
|
|||
|
plantTraits, pluton, ruspini, agriculture)</p>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<!-- code folding -->
|
|||
|
|
|||
|
|
|||
|
<!-- dynamically load mathjax for compatibility with self-contained -->
|
|||
|
<script>
|
|||
|
(function () {
  // Create the MathJax <script> element at runtime instead of embedding the
  // library in the page; the function wrapper keeps the locals out of the
  // global scope.
  var mathjaxLoader = document.createElement("script");
  mathjaxLoader.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
  mathjaxLoader.type = "text/javascript";

  // Attach the element to the first <head> of the document.
  var headElement = document.getElementsByTagName("head")[0];
  headElement.appendChild(mathjaxLoader);
})();
|
|||
|
</script>
|
|||
|
|
|||
|
</body>
|
|||
|
</html>
|