<!DOCTYPE html>
|
|||
|
|
|||
|
<html lang="en">
|
|||
|
|
|||
|
<head>
|
|||
|
|
|||
|
<meta charset="utf-8" />
|
|||
|
<meta name="generator" content="pandoc" />
|
|||
|
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
|
|||
|
|
|||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<title>Type and size stability</title>
|
|||
|
|
|||
|
<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
|
|||
|
// be compatible with the behavior of Pandoc < 2.8).
|
|||
|
// After the DOM has been parsed, remove every attribute from the heading
// element (h1-h6) that is the first child of a section div.
document.addEventListener('DOMContentLoaded', function(e) {
|
|||
|
// first child of every div.section whose class attribute contains "level"
var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
|
|||
|
var i, h, a;
|
|||
|
for (i = 0; i < hs.length; i++) {
|
|||
|
h = hs[i];
|
|||
|
if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6
|
|||
|
a = h.attributes;
|
|||
|
// always remove the attribute currently at index 0 until none are left;
// this relies on h.attributes being a live collection that shrinks on removal
while (a.length > 0) h.removeAttribute(a[0].name);
|
|||
|
}
|
|||
|
});
|
|||
|
</script>
|
|||
|
|
|||
|
<style type="text/css">
|
|||
|
code{white-space: pre-wrap;}
|
|||
|
span.smallcaps{font-variant: small-caps;}
|
|||
|
span.underline{text-decoration: underline;}
|
|||
|
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
|||
|
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
|||
|
ul.task-list{list-style: none;}
|
|||
|
</style>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<style type="text/css">
|
|||
|
code {
|
|||
|
white-space: pre;
|
|||
|
}
|
|||
|
.sourceCode {
|
|||
|
overflow: visible;
|
|||
|
}
|
|||
|
</style>
|
|||
|
<style type="text/css" data-origin="pandoc">
|
|||
|
pre > code.sourceCode { white-space: pre; position: relative; }
|
|||
|
pre > code.sourceCode > span { line-height: 1.25; }
|
|||
|
pre > code.sourceCode > span:empty { height: 1.2em; }
|
|||
|
.sourceCode { overflow: visible; }
|
|||
|
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
|||
|
div.sourceCode { margin: 1em 0; }
|
|||
|
pre.sourceCode { margin: 0; }
|
|||
|
@media screen {
|
|||
|
div.sourceCode { overflow: auto; }
|
|||
|
}
|
|||
|
@media print {
|
|||
|
pre > code.sourceCode { white-space: pre-wrap; }
|
|||
|
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
|
|||
|
}
|
|||
|
pre.numberSource code
|
|||
|
{ counter-reset: source-line 0; }
|
|||
|
pre.numberSource code > span
|
|||
|
{ position: relative; left: -4em; counter-increment: source-line; }
|
|||
|
pre.numberSource code > span > a:first-child::before
|
|||
|
{ content: counter(source-line);
|
|||
|
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
|||
|
border: none; display: inline-block;
|
|||
|
-webkit-touch-callout: none; -webkit-user-select: none;
|
|||
|
-khtml-user-select: none; -moz-user-select: none;
|
|||
|
-ms-user-select: none; user-select: none;
|
|||
|
padding: 0 4px; width: 4em;
|
|||
|
color: #aaaaaa;
|
|||
|
}
|
|||
|
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
|
|||
|
div.sourceCode
|
|||
|
{ }
|
|||
|
@media screen {
|
|||
|
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
|||
|
}
|
|||
|
code span.al { color: #ff0000; font-weight: bold; }
|
|||
|
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.at { color: #7d9029; }
|
|||
|
code span.bn { color: #40a070; }
|
|||
|
code span.bu { color: #008000; }
|
|||
|
code span.cf { color: #007020; font-weight: bold; }
|
|||
|
code span.ch { color: #4070a0; }
|
|||
|
code span.cn { color: #880000; }
|
|||
|
code span.co { color: #60a0b0; font-style: italic; }
|
|||
|
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.do { color: #ba2121; font-style: italic; }
|
|||
|
code span.dt { color: #902000; }
|
|||
|
code span.dv { color: #40a070; }
|
|||
|
code span.er { color: #ff0000; font-weight: bold; }
|
|||
|
code span.ex { }
|
|||
|
code span.fl { color: #40a070; }
|
|||
|
code span.fu { color: #06287e; }
|
|||
|
code span.im { color: #008000; font-weight: bold; }
|
|||
|
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.kw { color: #007020; font-weight: bold; }
|
|||
|
code span.op { color: #666666; }
|
|||
|
code span.ot { color: #007020; }
|
|||
|
code span.pp { color: #bc7a00; }
|
|||
|
code span.sc { color: #4070a0; }
|
|||
|
code span.ss { color: #bb6688; }
|
|||
|
code span.st { color: #4070a0; }
|
|||
|
code span.va { color: #19177c; }
|
|||
|
code span.vs { color: #4070a0; }
|
|||
|
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
</style>
|
|||
|
<script>
|
|||
|
// Apply pandoc's div.sourceCode style to pre.sourceCode instead: in every
// stylesheet whose owner element has data-origin="pandoc", a rule whose
// selector is exactly "div.sourceCode" and that sets color or background-color
// is replaced by a "pre.sourceCode" rule carrying the same declarations.
|
|||
|
(function() {
|
|||
|
var sheets = document.styleSheets;
|
|||
|
for (var i = 0; i < sheets.length; i++) {
|
|||
|
// only stylesheets tagged data-origin="pandoc" are rewritten
if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
|
|||
|
// skip the sheet if reading its rules throws
// NOTE(review): presumably guards against sheets the browser refuses to expose - confirm
try { var rules = sheets[i].cssRules; } catch (e) { continue; }
|
|||
|
var j = 0;
|
|||
|
while (j < rules.length) {
|
|||
|
var rule = rules[j];
|
|||
|
// check if there is a div.sourceCode rule
|
|||
|
if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
|
|||
|
j++;
|
|||
|
continue;
|
|||
|
}
|
|||
|
// capture the declarations before the rule is deleted below
var style = rule.style.cssText;
|
|||
|
// check if color or background-color is set
|
|||
|
if (rule.style.color === '' && rule.style.backgroundColor === '') {
|
|||
|
j++;
|
|||
|
continue;
|
|||
|
}
|
|||
|
// replace div.sourceCode by a pre.sourceCode rule
|
|||
|
// j is not advanced here: the rule inserted at index j no longer has the
// selector "div.sourceCode", so the next iteration steps past it
sheets[i].deleteRule(j);
|
|||
|
sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
|
|||
|
}
|
|||
|
}
|
|||
|
})();
|
|||
|
</script>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<style type="text/css">body {
|
|||
|
background-color: #fff;
|
|||
|
margin: 1em auto;
|
|||
|
max-width: 700px;
|
|||
|
overflow: visible;
|
|||
|
padding-left: 2em;
|
|||
|
padding-right: 2em;
|
|||
|
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
|
|||
|
font-size: 14px;
|
|||
|
line-height: 1.35;
|
|||
|
}
|
|||
|
#TOC {
|
|||
|
clear: both;
|
|||
|
margin: 0 0 10px 10px;
|
|||
|
padding: 4px;
|
|||
|
width: 400px;
|
|||
|
border: 1px solid #CCCCCC;
|
|||
|
border-radius: 5px;
|
|||
|
background-color: #f6f6f6;
|
|||
|
font-size: 13px;
|
|||
|
line-height: 1.3;
|
|||
|
}
|
|||
|
#TOC .toctitle {
|
|||
|
font-weight: bold;
|
|||
|
font-size: 15px;
|
|||
|
margin-left: 5px;
|
|||
|
}
|
|||
|
#TOC ul {
|
|||
|
padding-left: 40px;
|
|||
|
margin-left: -1.5em;
|
|||
|
margin-top: 5px;
|
|||
|
margin-bottom: 5px;
|
|||
|
}
|
|||
|
#TOC ul ul {
|
|||
|
margin-left: -2em;
|
|||
|
}
|
|||
|
#TOC li {
|
|||
|
line-height: 16px;
|
|||
|
}
|
|||
|
table {
|
|||
|
margin: 1em auto;
|
|||
|
border-width: 1px;
|
|||
|
border-color: #DDDDDD;
|
|||
|
border-style: outset;
|
|||
|
border-collapse: collapse;
|
|||
|
}
|
|||
|
table th {
|
|||
|
border-width: 2px;
|
|||
|
padding: 5px;
|
|||
|
border-style: inset;
|
|||
|
}
|
|||
|
table td {
|
|||
|
border-width: 1px;
|
|||
|
border-style: inset;
|
|||
|
line-height: 18px;
|
|||
|
padding: 5px 5px;
|
|||
|
}
|
|||
|
table, table th, table td {
|
|||
|
border-left-style: none;
|
|||
|
border-right-style: none;
|
|||
|
}
|
|||
|
table thead, table tr.even {
|
|||
|
background-color: #f7f7f7;
|
|||
|
}
|
|||
|
p {
|
|||
|
margin: 0.5em 0;
|
|||
|
}
|
|||
|
blockquote {
|
|||
|
background-color: #f6f6f6;
|
|||
|
padding: 0.25em 0.75em;
|
|||
|
}
|
|||
|
hr {
|
|||
|
border-style: solid;
|
|||
|
border: none;
|
|||
|
border-top: 1px solid #777;
|
|||
|
margin: 28px 0;
|
|||
|
}
|
|||
|
dl {
|
|||
|
margin-left: 0;
|
|||
|
}
|
|||
|
dl dd {
|
|||
|
margin-bottom: 13px;
|
|||
|
margin-left: 13px;
|
|||
|
}
|
|||
|
dl dt {
|
|||
|
font-weight: bold;
|
|||
|
}
|
|||
|
ul {
|
|||
|
margin-top: 0;
|
|||
|
}
|
|||
|
ul li {
|
|||
|
list-style: circle outside;
|
|||
|
}
|
|||
|
ul ul {
|
|||
|
margin-bottom: 0;
|
|||
|
}
|
|||
|
pre, code {
|
|||
|
background-color: #f7f7f7;
|
|||
|
border-radius: 3px;
|
|||
|
color: #333;
|
|||
|
white-space: pre-wrap;
|
|||
|
}
|
|||
|
pre {
|
|||
|
border-radius: 3px;
|
|||
|
margin: 5px 0px 10px 0px;
|
|||
|
padding: 10px;
|
|||
|
}
|
|||
|
pre:not([class]) {
|
|||
|
background-color: #f7f7f7;
|
|||
|
}
|
|||
|
code {
|
|||
|
font-family: Consolas, Monaco, 'Courier New', monospace;
|
|||
|
font-size: 85%;
|
|||
|
}
|
|||
|
p > code, li > code {
|
|||
|
padding: 2px 0px;
|
|||
|
}
|
|||
|
div.figure {
|
|||
|
text-align: center;
|
|||
|
}
|
|||
|
img {
|
|||
|
background-color: #FFFFFF;
|
|||
|
padding: 2px;
|
|||
|
border: 1px solid #DDDDDD;
|
|||
|
border-radius: 3px;
|
|||
|
border: 1px solid #CCCCCC;
|
|||
|
margin: 0 5px;
|
|||
|
}
|
|||
|
h1 {
|
|||
|
margin-top: 0;
|
|||
|
font-size: 35px;
|
|||
|
line-height: 40px;
|
|||
|
}
|
|||
|
h2 {
|
|||
|
border-bottom: 4px solid #f7f7f7;
|
|||
|
padding-top: 10px;
|
|||
|
padding-bottom: 2px;
|
|||
|
font-size: 145%;
|
|||
|
}
|
|||
|
h3 {
|
|||
|
border-bottom: 2px solid #f7f7f7;
|
|||
|
padding-top: 10px;
|
|||
|
font-size: 120%;
|
|||
|
}
|
|||
|
h4 {
|
|||
|
border-bottom: 1px solid #f7f7f7;
|
|||
|
margin-left: 8px;
|
|||
|
font-size: 105%;
|
|||
|
}
|
|||
|
h5, h6 {
|
|||
|
border-bottom: 1px solid #ccc;
|
|||
|
font-size: 105%;
|
|||
|
}
|
|||
|
a {
|
|||
|
color: #0033dd;
|
|||
|
text-decoration: none;
|
|||
|
}
|
|||
|
a:hover {
|
|||
|
color: #6666ff; }
|
|||
|
a:visited {
|
|||
|
color: #800080; }
|
|||
|
a:visited:hover {
|
|||
|
color: #BB00BB; }
|
|||
|
a[href^="http:"] {
|
|||
|
text-decoration: underline; }
|
|||
|
a[href^="https:"] {
|
|||
|
text-decoration: underline; }
|
|||
|
|
|||
|
code > span.kw { color: #555; font-weight: bold; }
|
|||
|
code > span.dt { color: #902000; }
|
|||
|
code > span.dv { color: #40a070; }
|
|||
|
code > span.bn { color: #d14; }
|
|||
|
code > span.fl { color: #d14; }
|
|||
|
code > span.ch { color: #d14; }
|
|||
|
code > span.st { color: #d14; }
|
|||
|
code > span.co { color: #888888; font-style: italic; }
|
|||
|
code > span.ot { color: #007020; }
|
|||
|
code > span.al { color: #ff0000; font-weight: bold; }
|
|||
|
code > span.fu { color: #900; font-weight: bold; }
|
|||
|
code > span.er { color: #a61717; background-color: #e3d2d2; }
|
|||
|
</style>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
</head>
|
|||
|
|
|||
|
<body>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<h1 class="title toc-ignore">Type and size stability</h1>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<p>This vignette introduces the ideas of type-stability and
|
|||
|
size-stability. If a function possesses these properties, it is
|
|||
|
substantially easier to reason about because to predict the “shape” of
|
|||
|
the output you only need to know the “shape”s of the inputs.</p>
|
|||
|
<p>This work is partly motivated by a common pattern that I noticed when
|
|||
|
reviewing code: if I read the code (without running it!), and I can’t
|
|||
|
predict the type of each variable, I feel very uneasy about the code.
|
|||
|
This sense is important because most unit tests explore typical inputs,
|
|||
|
rather than exhaustively testing the strange and unusual. Analysing the
|
|||
|
types (and size) of variables makes it possible to spot unpleasant edge
|
|||
|
cases.</p>
|
|||
|
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a><span class="fu">library</span>(vctrs)</span>
|
|||
|
<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a><span class="fu">library</span>(rlang)</span>
|
|||
|
<span id="cb1-3"><a href="#cb1-3" tabindex="-1"></a><span class="fu">library</span>(zeallot)</span></code></pre></div>
|
|||
|
<div id="definitions" class="section level2">
|
|||
|
<h2>Definitions</h2>
|
|||
|
<p>We say a function is <strong>type-stable</strong> iff:</p>
|
|||
|
<ol style="list-style-type: decimal">
|
|||
|
<li>You can predict the output type knowing only the input types.</li>
|
|||
|
<li>The order of arguments in … does not affect the output type.</li>
|
|||
|
</ol>
|
|||
|
<p>Similarly, a function is <strong>size-stable</strong> iff:</p>
|
|||
|
<ol style="list-style-type: decimal">
|
|||
|
<li>You can predict the output size knowing only the input sizes, or
|
|||
|
there is a single numeric input that specifies the output size.</li>
|
|||
|
</ol>
|
|||
|
<p>Very few base R functions are size-stable, so I’ll also define a
|
|||
|
slightly weaker condition. I’ll call a function
|
|||
|
<strong>length-stable</strong> iff:</p>
|
|||
|
<ol style="list-style-type: decimal">
|
|||
|
<li>You can predict the output <em>length</em> knowing only the input
|
|||
|
<em>lengths</em>, or there is a single numeric input that specifies the
|
|||
|
output <em>length</em>.</li>
|
|||
|
</ol>
|
|||
|
<p>(But note that length-stable is not a particularly robust definition
|
|||
|
because <code>length()</code> returns a value for things that are not
|
|||
|
vectors.)</p>
|
|||
|
<p>We’ll call functions that don’t obey these principles
|
|||
|
<strong>type-unstable</strong> and <strong>size-unstable</strong>
|
|||
|
respectively.</p>
|
|||
|
<p>On top of type- and size-stability it’s also desirable to have a
|
|||
|
single set of rules that are applied consistently. We want one set of
|
|||
|
type-coercion and size-recycling rules that apply everywhere, not many
|
|||
|
sets of rules that apply to different functions.</p>
|
|||
|
<p>The goal of these principles is to minimise cognitive overhead.
|
|||
|
Rather than having to memorise many special cases, you should be able to
|
|||
|
learn one set of principles and apply them again and again.</p>
|
|||
|
<div id="examples" class="section level3">
|
|||
|
<h3>Examples</h3>
|
|||
|
<p>To make these ideas concrete, let’s apply them to a few base
|
|||
|
functions:</p>
|
|||
|
<ol style="list-style-type: decimal">
|
|||
|
<li><p><code>mean()</code> is trivially type-stable and size-stable
|
|||
|
because it always returns a double vector of length 1 (or it throws an
|
|||
|
error).</p></li>
|
|||
|
<li><p>Surprisingly, <code>median()</code> is type-unstable:</p>
|
|||
|
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a><span class="fu">vec_ptype_show</span>(<span class="fu">median</span>(<span class="fu">c</span>(<span class="dv">1</span><span class="dt">L</span>, <span class="dv">1</span><span class="dt">L</span>)))</span>
|
|||
|
<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a><span class="co">#> Prototype: double</span></span>
|
|||
|
<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a><span class="fu">vec_ptype_show</span>(<span class="fu">median</span>(<span class="fu">c</span>(<span class="dv">1</span><span class="dt">L</span>, <span class="dv">1</span><span class="dt">L</span>, <span class="dv">1</span><span class="dt">L</span>)))</span>
|
|||
|
<span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a><span class="co">#> Prototype: integer</span></span></code></pre></div>
|
|||
|
<p>It is, however, size-stable, since it always returns a vector of
|
|||
|
length 1.</p></li>
|
|||
|
<li><p><code>sapply()</code> is type-unstable because you can’t predict
|
|||
|
the output type only knowing the input types:</p>
|
|||
|
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a><span class="fu">vec_ptype_show</span>(<span class="fu">sapply</span>(<span class="dv">1</span><span class="dt">L</span>, <span class="cf">function</span>(x) <span class="fu">c</span>(x, x)))</span>
|
|||
|
<span id="cb3-2"><a href="#cb3-2" tabindex="-1"></a><span class="co">#> Prototype: integer[,1]</span></span>
|
|||
|
<span id="cb3-3"><a href="#cb3-3" tabindex="-1"></a><span class="fu">vec_ptype_show</span>(<span class="fu">sapply</span>(<span class="fu">integer</span>(), <span class="cf">function</span>(x) <span class="fu">c</span>(x, x)))</span>
|
|||
|
<span id="cb3-4"><a href="#cb3-4" tabindex="-1"></a><span class="co">#> Prototype: list</span></span></code></pre></div>
|
|||
|
<p>It’s not quite size-stable; <code>vec_size(sapply(x, f))</code> is
|
|||
|
<code>vec_size(x)</code> for vectors but not for matrices (the output is
|
|||
|
transposed) or data frames (it iterates over the columns).</p></li>
|
|||
|
<li><p><code>vapply()</code> is a type-stable version of
|
|||
|
<code>sapply()</code> because
|
|||
|
<code>vec_ptype_show(vapply(x, fn, template))</code> is always
|
|||
|
<code>vec_ptype_show(template)</code>.<br />
|
|||
|
It is size-unstable for the same reasons as
|
|||
|
<code>sapply()</code>.</p></li>
|
|||
|
<li><p><code>c()</code> is type-unstable because <code>c(x, y)</code>
|
|||
|
doesn’t always output the same type as <code>c(y, x)</code>.</p>
|
|||
|
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a><span class="fu">vec_ptype_show</span>(<span class="fu">c</span>(<span class="cn">NA</span>, <span class="fu">Sys.Date</span>()))</span>
|
|||
|
<span id="cb4-2"><a href="#cb4-2" tabindex="-1"></a><span class="co">#> Prototype: double</span></span>
|
|||
|
<span id="cb4-3"><a href="#cb4-3" tabindex="-1"></a><span class="fu">vec_ptype_show</span>(<span class="fu">c</span>(<span class="fu">Sys.Date</span>(), <span class="cn">NA</span>))</span>
|
|||
|
<span id="cb4-4"><a href="#cb4-4" tabindex="-1"></a><span class="co">#> Prototype: date</span></span></code></pre></div>
|
|||
|
<p><code>c()</code> is <em>almost always</em> length-stable because
|
|||
|
<code>length(c(x, y))</code> <em>almost always</em> equals
|
|||
|
<code>length(x) + length(y)</code>. One common source of instability
|
|||
|
here is dealing with non-vectors (see the later section
|
|||
|
“Non-vectors”):</p>
|
|||
|
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a>env <span class="ot"><-</span> <span class="fu">new.env</span>(<span class="at">parent =</span> <span class="fu">emptyenv</span>())</span>
|
|||
|
<span id="cb5-2"><a href="#cb5-2" tabindex="-1"></a><span class="fu">length</span>(env)</span>
|
|||
|
<span id="cb5-3"><a href="#cb5-3" tabindex="-1"></a><span class="co">#> [1] 0</span></span>
|
|||
|
<span id="cb5-4"><a href="#cb5-4" tabindex="-1"></a><span class="fu">length</span>(mean)</span>
|
|||
|
<span id="cb5-5"><a href="#cb5-5" tabindex="-1"></a><span class="co">#> [1] 1</span></span>
|
|||
|
<span id="cb5-6"><a href="#cb5-6" tabindex="-1"></a><span class="fu">length</span>(<span class="fu">c</span>(env, mean))</span>
|
|||
|
<span id="cb5-7"><a href="#cb5-7" tabindex="-1"></a><span class="co">#> [1] 2</span></span></code></pre></div></li>
|
|||
|
<li><p><code>paste(x1, x2)</code> is length-stable because
|
|||
|
<code>length(paste(x1, x2))</code> equals
|
|||
|
<code>max(length(x1), length(x2))</code>. However, it doesn’t follow the
|
|||
|
usual arithmetic recycling rules because <code>paste(1:2, 1:3)</code>
|
|||
|
doesn’t generate a warning.</p></li>
|
|||
|
<li><p><code>ifelse()</code> is length-stable because
|
|||
|
<code>length(ifelse(cond, true, false))</code> is always
|
|||
|
<code>length(cond)</code>. <code>ifelse()</code> is type-unstable
|
|||
|
because the output type depends on the value of <code>cond</code>:</p>
|
|||
|
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a><span class="fu">vec_ptype_show</span>(<span class="fu">ifelse</span>(<span class="cn">NA</span>, <span class="dv">1</span><span class="dt">L</span>, <span class="dv">1</span><span class="dt">L</span>))</span>
|
|||
|
<span id="cb6-2"><a href="#cb6-2" tabindex="-1"></a><span class="co">#> Prototype: logical</span></span>
|
|||
|
<span id="cb6-3"><a href="#cb6-3" tabindex="-1"></a><span class="fu">vec_ptype_show</span>(<span class="fu">ifelse</span>(<span class="cn">FALSE</span>, <span class="dv">1</span><span class="dt">L</span>, <span class="dv">1</span><span class="dt">L</span>))</span>
|
|||
|
<span id="cb6-4"><a href="#cb6-4" tabindex="-1"></a><span class="co">#> Prototype: integer</span></span></code></pre></div></li>
|
|||
|
<li><p><code>read.csv(file)</code> is type-unstable and size-unstable
|
|||
|
because, while you know it will return a data frame, you don’t know what
|
|||
|
columns it will return or how many rows it will have. Similarly,
|
|||
|
<code>df[[i]]</code> is not type-stable because the result depends on
|
|||
|
the <em>value</em> of <code>i</code>. There are many important functions
|
|||
|
that cannot be made type-stable or size-stable!</p></li>
|
|||
|
</ol>
|
|||
|
<p>With this understanding of type- and size-stability in hand, we’ll
|
|||
|
use them to analyse some base R functions in greater depth and then
|
|||
|
propose alternatives with better properties.</p>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
<div id="c-and-vctrsvec_c" class="section level2">
|
|||
|
<h2><code>c()</code> and <code>vctrs::vec_c()</code></h2>
|
|||
|
<p>In this section we’ll compare and contrast <code>c()</code> and
|
|||
|
<code>vec_c()</code>. <code>vec_c()</code> is both type- and size-stable
|
|||
|
because it possesses the following invariants:</p>
|
|||
|
<ul>
|
|||
|
<li><code>vec_ptype(vec_c(x, y))</code> equals
|
|||
|
<code>vec_ptype_common(x, y)</code>.</li>
|
|||
|
<li><code>vec_size(vec_c(x, y))</code> equals
|
|||
|
<code>vec_size(x) + vec_size(y)</code>.</li>
|
|||
|
</ul>
|
|||
|
<p><code>c()</code> has another undesirable property: it’s not
consistent with <code>unlist()</code>. That is,
<code>unlist(list(x, y))</code> does not always equal
<code>c(x, y)</code>, which means base R has multiple sets of
type-coercion rules. I won’t consider this problem further here.</p>
|
|||
|
<p>I have two goals here:</p>
|
|||
|
<ul>
|
|||
|
<li><p>To fully document the quirks of <code>c()</code>, hence
|
|||
|
motivating the development of an alternative.</p></li>
|
|||
|
<li><p>To discuss non-obvious consequences of the type- and
|
|||
|
size-stability above.</p></li>
|
|||
|
</ul>
|
|||
|
<div id="atomic-vectors" class="section level3">
|
|||
|
<h3>Atomic vectors</h3>
|
|||
|
<p>If we only consider atomic vectors, <code>c()</code> is type-stable
|
|||
|
because it uses a hierarchy of types: character > complex > double
|
|||
|
> integer > logical.</p>
|
|||
|
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a><span class="fu">c</span>(<span class="cn">FALSE</span>, <span class="dv">1</span><span class="dt">L</span>, <span class="fl">2.5</span>)</span>
|
|||
|
<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a><span class="co">#> [1] 0.0 1.0 2.5</span></span></code></pre></div>
|
|||
|
<p><code>vec_c()</code> obeys similar rules:</p>
|
|||
|
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" tabindex="-1"></a><span class="fu">vec_c</span>(<span class="cn">FALSE</span>, <span class="dv">1</span><span class="dt">L</span>, <span class="fl">2.5</span>)</span>
|
|||
|
<span id="cb8-2"><a href="#cb8-2" tabindex="-1"></a><span class="co">#> [1] 0.0 1.0 2.5</span></span></code></pre></div>
|
|||
|
<p>But it does not automatically coerce to character vectors or
|
|||
|
lists:</p>
|
|||
|
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a><span class="fu">c</span>(<span class="cn">FALSE</span>, <span class="st">"x"</span>)</span>
|
|||
|
<span id="cb9-2"><a href="#cb9-2" tabindex="-1"></a><span class="co">#> [1] "FALSE" "x"</span></span>
|
|||
|
<span id="cb9-3"><a href="#cb9-3" tabindex="-1"></a><span class="fu">vec_c</span>(<span class="cn">FALSE</span>, <span class="st">"x"</span>)</span>
|
|||
|
<span id="cb9-4"><a href="#cb9-4" tabindex="-1"></a><span class="co">#> Error in `vec_c()`:</span></span>
|
|||
|
<span id="cb9-5"><a href="#cb9-5" tabindex="-1"></a><span class="co">#&gt; ! Can't combine `..1` &lt;logical&gt; and `..2` &lt;character&gt;.</span></span>
|
|||
|
<span id="cb9-6"><a href="#cb9-6" tabindex="-1"></a></span>
|
|||
|
<span id="cb9-7"><a href="#cb9-7" tabindex="-1"></a><span class="fu">c</span>(<span class="cn">FALSE</span>, <span class="fu">list</span>(<span class="dv">1</span>))</span>
|
|||
|
<span id="cb9-8"><a href="#cb9-8" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb9-9"><a href="#cb9-9" tabindex="-1"></a><span class="co">#> [1] FALSE</span></span>
|
|||
|
<span id="cb9-10"><a href="#cb9-10" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb9-11"><a href="#cb9-11" tabindex="-1"></a><span class="co">#> [[2]]</span></span>
|
|||
|
<span id="cb9-12"><a href="#cb9-12" tabindex="-1"></a><span class="co">#> [1] 1</span></span>
|
|||
|
<span id="cb9-13"><a href="#cb9-13" tabindex="-1"></a><span class="fu">vec_c</span>(<span class="cn">FALSE</span>, <span class="fu">list</span>(<span class="dv">1</span>))</span>
|
|||
|
<span id="cb9-14"><a href="#cb9-14" tabindex="-1"></a><span class="co">#> Error in `vec_c()`:</span></span>
|
|||
|
<span id="cb9-15"><a href="#cb9-15" tabindex="-1"></a><span class="co">#&gt; ! Can't combine `..1` &lt;logical&gt; and `..2` &lt;list&gt;.</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="incompatible-vectors-and-non-vectors" class="section level3">
|
|||
|
<h3>Incompatible vectors and non-vectors</h3>
|
|||
|
<p>In general, most base methods do not throw an error:</p>
|
|||
|
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a><span class="fu">c</span>(<span class="fl">10.5</span>, <span class="fu">factor</span>(<span class="st">"x"</span>))</span>
|
|||
|
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a><span class="co">#> [1] 10.5 1.0</span></span></code></pre></div>
|
|||
|
<p>If the inputs aren’t vectors, <code>c()</code> automatically puts
|
|||
|
them in a list:</p>
|
|||
|
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a><span class="fu">c</span>(mean, <span class="fu">globalenv</span>())</span>
|
|||
|
<span id="cb11-2"><a href="#cb11-2" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb11-3"><a href="#cb11-3" tabindex="-1"></a><span class="co">#> function (x, ...) </span></span>
|
|||
|
<span id="cb11-4"><a href="#cb11-4" tabindex="-1"></a><span class="co">#> UseMethod("mean")</span></span>
|
|||
|
<span id="cb11-5"><a href="#cb11-5" tabindex="-1"></a><span class="co">#&gt; &lt;bytecode: 0x103a05448&gt;</span></span>
|
|||
|
<span id="cb11-6"><a href="#cb11-6" tabindex="-1"></a><span class="co">#&gt; &lt;environment: namespace:base&gt;</span></span>
|
|||
|
<span id="cb11-7"><a href="#cb11-7" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb11-8"><a href="#cb11-8" tabindex="-1"></a><span class="co">#> [[2]]</span></span>
|
|||
|
<span id="cb11-9"><a href="#cb11-9" tabindex="-1"></a><span class="co">#&gt; &lt;environment: R_GlobalEnv&gt;</span></span></code></pre></div>
|
|||
|
<p>For numeric versions, the result depends on the order of the inputs:
if the version comes first, <code>c()</code> throws an error; otherwise
the inputs are combined into a list:</p>
|
|||
|
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" tabindex="-1"></a><span class="fu">c</span>(<span class="fu">getRversion</span>(), <span class="st">"x"</span>)</span>
|
|||
|
<span id="cb12-2"><a href="#cb12-2" tabindex="-1"></a><span class="co">#> Error: invalid version specification 'x'</span></span>
|
|||
|
<span id="cb12-3"><a href="#cb12-3" tabindex="-1"></a></span>
|
|||
|
<span id="cb12-4"><a href="#cb12-4" tabindex="-1"></a><span class="fu">c</span>(<span class="st">"x"</span>, <span class="fu">getRversion</span>())</span>
|
|||
|
<span id="cb12-5"><a href="#cb12-5" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb12-6"><a href="#cb12-6" tabindex="-1"></a><span class="co">#> [1] "x"</span></span>
|
|||
|
<span id="cb12-7"><a href="#cb12-7" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb12-8"><a href="#cb12-8" tabindex="-1"></a><span class="co">#> [[2]]</span></span>
|
|||
|
<span id="cb12-9"><a href="#cb12-9" tabindex="-1"></a><span class="co">#> [1] 4 3 1</span></span></code></pre></div>
|
|||
|
<p><code>vec_c()</code> throws an error if the inputs are not vectors or
|
|||
|
not automatically coercible:</p>
|
|||
|
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" tabindex="-1"></a><span class="fu">vec_c</span>(mean, <span class="fu">globalenv</span>())</span>
|
|||
|
<span id="cb13-2"><a href="#cb13-2" tabindex="-1"></a><span class="co">#> Error in `vec_c()`:</span></span>
|
|||
|
<span id="cb13-3"><a href="#cb13-3" tabindex="-1"></a><span class="co">#> ! `..1` must be a vector, not a function.</span></span>
|
|||
|
<span id="cb13-4"><a href="#cb13-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb13-5"><a href="#cb13-5" tabindex="-1"></a><span class="fu">vec_c</span>(<span class="fu">Sys.Date</span>(), <span class="fu">factor</span>(<span class="st">"x"</span>), <span class="st">"x"</span>)</span>
|
|||
|
<span id="cb13-6"><a href="#cb13-6" tabindex="-1"></a><span class="co">#> Error in `vec_c()`:</span></span>
|
|||
|
<span id="cb13-7"><a href="#cb13-7" tabindex="-1"></a><span class="co">#&gt; ! Can't combine `..1` &lt;date&gt; and `..2` &lt;factor&lt;bf275&gt;&gt;.</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="factors" class="section level3">
|
|||
|
<h3>Factors</h3>
|
|||
|
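<p>Combining two factors with <code>c()</code> used to return an integer vector of the underlying codes; since R 4.1.0 it returns a factor with the union of the levels, as shown below:</p>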
|
|||
|
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" tabindex="-1"></a>fa <span class="ot"><-</span> <span class="fu">factor</span>(<span class="st">"a"</span>)</span>
|
|||
|
<span id="cb14-2"><a href="#cb14-2" tabindex="-1"></a>fb <span class="ot"><-</span> <span class="fu">factor</span>(<span class="st">"b"</span>)</span>
|
|||
|
<span id="cb14-3"><a href="#cb14-3" tabindex="-1"></a></span>
|
|||
|
<span id="cb14-4"><a href="#cb14-4" tabindex="-1"></a><span class="fu">c</span>(fa, fb)</span>
|
|||
|
<span id="cb14-5"><a href="#cb14-5" tabindex="-1"></a><span class="co">#> [1] a b</span></span>
|
|||
|
<span id="cb14-6"><a href="#cb14-6" tabindex="-1"></a><span class="co">#> Levels: a b</span></span></code></pre></div>
|
|||
|
<p>(The integer result returned by <code>c()</code> before R 4.1.0 was
documented in <code>?c</code> but was still undesirable.)</p>
|
|||
|
<p><code>vec_c()</code> returns a factor taking the union of the levels.
|
|||
|
This behaviour is motivated by pragmatics: there are many places in base
|
|||
|
R that automatically convert character vectors to factors, so enforcing
|
|||
|
stricter behaviour would be unnecessarily onerous. (This is backed up by
|
|||
|
experience with <code>dplyr::bind_rows()</code>, which is stricter and
|
|||
|
is a common source of user difficulty.)</p>
|
|||
|
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" tabindex="-1"></a><span class="fu">vec_c</span>(fa, fb)</span>
|
|||
|
<span id="cb15-2"><a href="#cb15-2" tabindex="-1"></a><span class="co">#> [1] a b</span></span>
|
|||
|
<span id="cb15-3"><a href="#cb15-3" tabindex="-1"></a><span class="co">#> Levels: a b</span></span>
|
|||
|
<span id="cb15-4"><a href="#cb15-4" tabindex="-1"></a><span class="fu">vec_c</span>(fb, fa)</span>
|
|||
|
<span id="cb15-5"><a href="#cb15-5" tabindex="-1"></a><span class="co">#> [1] b a</span></span>
|
|||
|
<span id="cb15-6"><a href="#cb15-6" tabindex="-1"></a><span class="co">#> Levels: b a</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="date-times" class="section level3">
|
|||
|
<h3>Date-times</h3>
|
|||
|
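<p>In older versions of R (before 4.1.0), <code>c()</code> stripped the time zone associated with date-times; the output below comes from a newer version, which keeps a time zone that is shared by all inputs:</p>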
|
|||
|
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a>datetime_nz <span class="ot"><-</span> <span class="fu">as.POSIXct</span>(<span class="st">"2020-01-01 09:00"</span>, <span class="at">tz =</span> <span class="st">"Pacific/Auckland"</span>)</span>
|
|||
|
<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a><span class="fu">c</span>(datetime_nz)</span>
|
|||
|
<span id="cb16-3"><a href="#cb16-3" tabindex="-1"></a><span class="co">#> [1] "2020-01-01 09:00:00 NZDT"</span></span></code></pre></div>
|
|||
|
<p>This behaviour is documented in <code>?DateTimeClasses</code> but is
|
|||
|
the source of considerable user pain.</p>
|
|||
|
<p><code>vec_c()</code> preserves time zones:</p>
|
|||
|
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" tabindex="-1"></a><span class="fu">vec_c</span>(datetime_nz)</span>
|
|||
|
<span id="cb17-2"><a href="#cb17-2" tabindex="-1"></a><span class="co">#> [1] "2020-01-01 09:00:00 NZDT"</span></span></code></pre></div>
|
|||
|
<p>What time zone should the output have if inputs have different time
|
|||
|
zones? One option would be to be strict and force the user to manually
|
|||
|
align all the time zones. However, this is onerous (particularly because
|
|||
|
there’s no easy way to change the time zone in base R), so vctrs chooses
|
|||
|
to use the first non-local time zone:</p>
|
|||
|
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" tabindex="-1"></a>datetime_local <span class="ot"><-</span> <span class="fu">as.POSIXct</span>(<span class="st">"2020-01-01 09:00"</span>)</span>
|
|||
|
<span id="cb18-2"><a href="#cb18-2" tabindex="-1"></a>datetime_houston <span class="ot"><-</span> <span class="fu">as.POSIXct</span>(<span class="st">"2020-01-01 09:00"</span>, <span class="at">tz =</span> <span class="st">"US/Central"</span>)</span>
|
|||
|
<span id="cb18-3"><a href="#cb18-3" tabindex="-1"></a></span>
|
|||
|
<span id="cb18-4"><a href="#cb18-4" tabindex="-1"></a><span class="fu">vec_c</span>(datetime_local, datetime_houston, datetime_nz)</span>
|
|||
|
<span id="cb18-5"><a href="#cb18-5" tabindex="-1"></a><span class="co">#> [1] "2020-01-01 08:00:00 CST" "2020-01-01 09:00:00 CST"</span></span>
|
|||
|
<span id="cb18-6"><a href="#cb18-6" tabindex="-1"></a><span class="co">#> [3] "2019-12-31 14:00:00 CST"</span></span>
|
|||
|
<span id="cb18-7"><a href="#cb18-7" tabindex="-1"></a><span class="fu">vec_c</span>(datetime_houston, datetime_nz)</span>
|
|||
|
<span id="cb18-8"><a href="#cb18-8" tabindex="-1"></a><span class="co">#> [1] "2020-01-01 09:00:00 CST" "2019-12-31 14:00:00 CST"</span></span>
|
|||
|
<span id="cb18-9"><a href="#cb18-9" tabindex="-1"></a><span class="fu">vec_c</span>(datetime_nz, datetime_houston)</span>
|
|||
|
<span id="cb18-10"><a href="#cb18-10" tabindex="-1"></a><span class="co">#> [1] "2020-01-01 09:00:00 NZDT" "2020-01-02 04:00:00 NZDT"</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="dates-and-date-times" class="section level3">
|
|||
|
<h3>Dates and date-times</h3>
|
|||
|
<p>Combining dates and date-times with <code>c()</code> gives silently
|
|||
|
incorrect results:</p>
|
|||
|
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" tabindex="-1"></a>date <span class="ot"><-</span> <span class="fu">as.Date</span>(<span class="st">"2020-01-01"</span>)</span>
|
|||
|
<span id="cb19-2"><a href="#cb19-2" tabindex="-1"></a>datetime <span class="ot"><-</span> <span class="fu">as.POSIXct</span>(<span class="st">"2020-01-01 09:00"</span>)</span>
|
|||
|
<span id="cb19-3"><a href="#cb19-3" tabindex="-1"></a></span>
|
|||
|
<span id="cb19-4"><a href="#cb19-4" tabindex="-1"></a><span class="fu">c</span>(date, datetime)</span>
|
|||
|
<span id="cb19-5"><a href="#cb19-5" tabindex="-1"></a><span class="co">#> [1] "2020-01-01" "2020-01-01"</span></span>
|
|||
|
<span id="cb19-6"><a href="#cb19-6" tabindex="-1"></a><span class="fu">c</span>(datetime, date)</span>
|
|||
|
<span id="cb19-7"><a href="#cb19-7" tabindex="-1"></a><span class="co">#> [1] "2020-01-01 09:00:00 EST" "2019-12-31 19:00:00 EST"</span></span></code></pre></div>
|
|||
|
<p>This behaviour arises because neither <code>c.Date()</code> nor
|
|||
|
<code>c.POSIXct()</code> checks that all inputs are of the same type.</p>
|
|||
|
<p><code>vec_c()</code> uses a standard set of rules to avoid this
|
|||
|
problem. When you mix dates and date-times, vctrs returns a date-time
|
|||
|
and converts dates to date-times at midnight (in the timezone of the
|
|||
|
date-time).</p>
|
|||
|
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a><span class="fu">vec_c</span>(date, datetime)</span>
|
|||
|
<span id="cb20-2"><a href="#cb20-2" tabindex="-1"></a><span class="co">#> [1] "2020-01-01 00:00:00 EST" "2020-01-01 09:00:00 EST"</span></span>
|
|||
|
<span id="cb20-3"><a href="#cb20-3" tabindex="-1"></a><span class="fu">vec_c</span>(date, datetime_nz)</span>
|
|||
|
<span id="cb20-4"><a href="#cb20-4" tabindex="-1"></a><span class="co">#> [1] "2020-01-01 00:00:00 NZDT" "2020-01-01 09:00:00 NZDT"</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="missing-values" class="section level3">
|
|||
|
<h3>Missing values</h3>
|
|||
|
<p>If a missing value comes at the beginning of the inputs,
|
|||
|
<code>c()</code> falls back to the internal behaviour, which strips all
|
|||
|
attributes:</p>
|
|||
|
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" tabindex="-1"></a><span class="fu">c</span>(<span class="cn">NA</span>, fa)</span>
|
|||
|
<span id="cb21-2"><a href="#cb21-2" tabindex="-1"></a><span class="co">#> [1] NA 1</span></span>
|
|||
|
<span id="cb21-3"><a href="#cb21-3" tabindex="-1"></a><span class="fu">c</span>(<span class="cn">NA</span>, date)</span>
|
|||
|
<span id="cb21-4"><a href="#cb21-4" tabindex="-1"></a><span class="co">#> [1] NA 18262</span></span>
|
|||
|
<span id="cb21-5"><a href="#cb21-5" tabindex="-1"></a><span class="fu">c</span>(<span class="cn">NA</span>, datetime)</span>
|
|||
|
<span id="cb21-6"><a href="#cb21-6" tabindex="-1"></a><span class="co">#> [1] NA 1577887200</span></span></code></pre></div>
|
|||
|
<p><code>vec_c()</code> takes a different approach, treating a logical
|
|||
|
vector consisting only of <code>NA</code> as the
|
|||
|
<code>unspecified()</code> class, which can be converted to any other 1d
|
|||
|
type:</p>
|
|||
|
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" tabindex="-1"></a><span class="fu">vec_c</span>(<span class="cn">NA</span>, fa)</span>
|
|||
|
<span id="cb22-2"><a href="#cb22-2" tabindex="-1"></a><span class="co">#> [1] &lt;NA&gt; a </span></span>
|
|||
|
<span id="cb22-3"><a href="#cb22-3" tabindex="-1"></a><span class="co">#> Levels: a</span></span>
|
|||
|
<span id="cb22-4"><a href="#cb22-4" tabindex="-1"></a><span class="fu">vec_c</span>(<span class="cn">NA</span>, date)</span>
|
|||
|
<span id="cb22-5"><a href="#cb22-5" tabindex="-1"></a><span class="co">#> [1] NA "2020-01-01"</span></span>
|
|||
|
<span id="cb22-6"><a href="#cb22-6" tabindex="-1"></a><span class="fu">vec_c</span>(<span class="cn">NA</span>, datetime)</span>
|
|||
|
<span id="cb22-7"><a href="#cb22-7" tabindex="-1"></a><span class="co">#> [1] NA "2020-01-01 09:00:00 EST"</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="data-frames" class="section level3">
|
|||
|
<h3>Data frames</h3>
|
|||
|
<p>Because it is <em>almost always</em> length-stable, <code>c()</code>
|
|||
|
combines data frames column-wise (into a list):</p>
|
|||
|
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" tabindex="-1"></a>df1 <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="at">x =</span> <span class="dv">1</span>)</span>
|
|||
|
<span id="cb23-2"><a href="#cb23-2" tabindex="-1"></a>df2 <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="at">x =</span> <span class="dv">2</span>)</span>
|
|||
|
<span id="cb23-3"><a href="#cb23-3" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">c</span>(df1, df1))</span>
|
|||
|
<span id="cb23-4"><a href="#cb23-4" tabindex="-1"></a><span class="co">#> List of 2</span></span>
|
|||
|
<span id="cb23-5"><a href="#cb23-5" tabindex="-1"></a><span class="co">#> $ x: num 1</span></span>
|
|||
|
<span id="cb23-6"><a href="#cb23-6" tabindex="-1"></a><span class="co">#> $ x: num 1</span></span></code></pre></div>
|
|||
|
<p><code>vec_c()</code> is size-stable, which implies it will row-bind
|
|||
|
data frames:</p>
|
|||
|
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" tabindex="-1"></a><span class="fu">vec_c</span>(df1, df2)</span>
|
|||
|
<span id="cb24-2"><a href="#cb24-2" tabindex="-1"></a><span class="co">#> x</span></span>
|
|||
|
<span id="cb24-3"><a href="#cb24-3" tabindex="-1"></a><span class="co">#> 1 1</span></span>
|
|||
|
<span id="cb24-4"><a href="#cb24-4" tabindex="-1"></a><span class="co">#> 2 2</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="matrices-and-arrays" class="section level3">
|
|||
|
<h3>Matrices and arrays</h3>
|
|||
|
<p>The same reasoning applies to matrices:</p>
|
|||
|
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" tabindex="-1"></a>m <span class="ot"><-</span> <span class="fu">matrix</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">4</span>, <span class="at">nrow =</span> <span class="dv">2</span>)</span>
|
|||
|
<span id="cb25-2"><a href="#cb25-2" tabindex="-1"></a><span class="fu">c</span>(m, m)</span>
|
|||
|
<span id="cb25-3"><a href="#cb25-3" tabindex="-1"></a><span class="co">#> [1] 1 2 3 4 1 2 3 4</span></span>
|
|||
|
<span id="cb25-4"><a href="#cb25-4" tabindex="-1"></a><span class="fu">vec_c</span>(m, m)</span>
|
|||
|
<span id="cb25-5"><a href="#cb25-5" tabindex="-1"></a><span class="co">#> [,1] [,2]</span></span>
|
|||
|
<span id="cb25-6"><a href="#cb25-6" tabindex="-1"></a><span class="co">#> [1,] 1 3</span></span>
|
|||
|
<span id="cb25-7"><a href="#cb25-7" tabindex="-1"></a><span class="co">#> [2,] 2 4</span></span>
|
|||
|
<span id="cb25-8"><a href="#cb25-8" tabindex="-1"></a><span class="co">#> [3,] 1 3</span></span>
|
|||
|
<span id="cb25-9"><a href="#cb25-9" tabindex="-1"></a><span class="co">#> [4,] 2 4</span></span></code></pre></div>
|
|||
|
<p>One difference is that <code>vec_c()</code> will “broadcast” a vector
|
|||
|
to match the dimensions of a matrix:</p>
|
|||
|
<div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" tabindex="-1"></a><span class="fu">c</span>(m, <span class="dv">1</span>)</span>
|
|||
|
<span id="cb26-2"><a href="#cb26-2" tabindex="-1"></a><span class="co">#> [1] 1 2 3 4 1</span></span>
|
|||
|
<span id="cb26-3"><a href="#cb26-3" tabindex="-1"></a></span>
|
|||
|
<span id="cb26-4"><a href="#cb26-4" tabindex="-1"></a><span class="fu">vec_c</span>(m, <span class="dv">1</span>)</span>
|
|||
|
<span id="cb26-5"><a href="#cb26-5" tabindex="-1"></a><span class="co">#> [,1] [,2]</span></span>
|
|||
|
<span id="cb26-6"><a href="#cb26-6" tabindex="-1"></a><span class="co">#> [1,] 1 3</span></span>
|
|||
|
<span id="cb26-7"><a href="#cb26-7" tabindex="-1"></a><span class="co">#> [2,] 2 4</span></span>
|
|||
|
<span id="cb26-8"><a href="#cb26-8" tabindex="-1"></a><span class="co">#> [3,] 1 1</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="implementation" class="section level3">
|
|||
|
<h3>Implementation</h3>
|
|||
|
<p>The basic implementation of <code>vec_c()</code> is reasonably
|
|||
|
simple. We first figure out the properties of the output, i.e. the
|
|||
|
common type and total size, and then allocate it with
|
|||
|
<code>vec_init()</code>, and then insert each input into the correct
|
|||
|
place in the output.</p>
|
|||
|
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" tabindex="-1"></a>vec_c <span class="ot"><-</span> <span class="cf">function</span>(...) {</span>
|
|||
|
<span id="cb27-2"><a href="#cb27-2" tabindex="-1"></a> args <span class="ot"><-</span> <span class="fu">compact</span>(<span class="fu">list2</span>(...))</span>
|
|||
|
<span id="cb27-3"><a href="#cb27-3" tabindex="-1"></a></span>
|
|||
|
<span id="cb27-4"><a href="#cb27-4" tabindex="-1"></a> ptype <span class="ot"><-</span> <span class="fu">vec_ptype_common</span>(<span class="sc">!!!</span>args)</span>
|
|||
|
<span id="cb27-5"><a href="#cb27-5" tabindex="-1"></a> <span class="cf">if</span> (<span class="fu">is.null</span>(ptype))</span>
|
|||
|
<span id="cb27-6"><a href="#cb27-6" tabindex="-1"></a> <span class="fu">return</span>(<span class="cn">NULL</span>)</span>
|
|||
|
<span id="cb27-7"><a href="#cb27-7" tabindex="-1"></a></span>
|
|||
|
<span id="cb27-8"><a href="#cb27-8" tabindex="-1"></a> ns <span class="ot"><-</span> <span class="fu">map_int</span>(args, vec_size)</span>
|
|||
|
<span id="cb27-9"><a href="#cb27-9" tabindex="-1"></a> out <span class="ot"><-</span> <span class="fu">vec_init</span>(ptype, <span class="fu">sum</span>(ns))</span>
|
|||
|
<span id="cb27-10"><a href="#cb27-10" tabindex="-1"></a></span>
|
|||
|
<span id="cb27-11"><a href="#cb27-11" tabindex="-1"></a> pos <span class="ot"><-</span> <span class="dv">1</span></span>
|
|||
|
<span id="cb27-12"><a href="#cb27-12" tabindex="-1"></a> <span class="cf">for</span> (i <span class="cf">in</span> <span class="fu">seq_along</span>(ns)) {</span>
|
|||
|
<span id="cb27-13"><a href="#cb27-13" tabindex="-1"></a> n <span class="ot"><-</span> ns[[i]]</span>
|
|||
|
<span id="cb27-14"><a href="#cb27-14" tabindex="-1"></a> </span>
|
|||
|
<span id="cb27-15"><a href="#cb27-15" tabindex="-1"></a> x <span class="ot"><-</span> <span class="fu">vec_cast</span>(args[[i]], <span class="at">to =</span> ptype)</span>
|
|||
|
<span id="cb27-16"><a href="#cb27-16" tabindex="-1"></a> <span class="fu">vec_slice</span>(out, pos<span class="sc">:</span>(pos <span class="sc">+</span> n <span class="sc">-</span> <span class="dv">1</span>)) <span class="ot"><-</span> x</span>
|
|||
|
<span id="cb27-17"><a href="#cb27-17" tabindex="-1"></a> pos <span class="ot"><-</span> pos <span class="sc">+</span> n</span>
|
|||
|
<span id="cb27-18"><a href="#cb27-18" tabindex="-1"></a> }</span>
|
|||
|
<span id="cb27-19"><a href="#cb27-19" tabindex="-1"></a></span>
|
|||
|
<span id="cb27-20"><a href="#cb27-20" tabindex="-1"></a> out</span>
|
|||
|
<span id="cb27-21"><a href="#cb27-21" tabindex="-1"></a>}</span></code></pre></div>
|
|||
|
<p>(The real <code>vec_c()</code> is a bit more complicated in order to
|
|||
|
handle inner and outer names).</p>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
<div id="ifelse" class="section level2">
|
|||
|
<h2><code>ifelse()</code></h2>
|
|||
|
<p>One of the functions that motivated the development of vctrs is
|
|||
|
<code>ifelse()</code>. It has the surprising property that the result
|
|||
|
value is “A vector of the same length and attributes (including
|
|||
|
dimensions and class) as <code>test</code>”. To me, it seems more
|
|||
|
reasonable for the type of the output to be controlled by the type of
|
|||
|
the <code>yes</code> and <code>no</code> arguments.</p>
|
|||
|
<p>In <code>dplyr::if_else()</code> I swung too far towards strictness:
|
|||
|
it throws an error if <code>yes</code> and <code>no</code> are not the
|
|||
|
same type. This is annoying in practice because it requires typed
|
|||
|
missing values (<code>NA_character_</code> etc), and because the checks
|
|||
|
are only on the class (not the full prototype), it’s easy to create
|
|||
|
invalid output.</p>
|
|||
|
<p>I found it much easier to understand what <code>ifelse()</code>
|
|||
|
<em>should</em> do once I internalised the ideas of type- and
|
|||
|
size-stability:</p>
|
|||
|
<ul>
|
|||
|
<li><p>The first argument must be logical.</p></li>
|
|||
|
<li><p><code>vec_ptype(if_else(test, yes, no))</code> equals
|
|||
|
<code>vec_ptype_common(yes, no)</code>. Unlike <code>ifelse()</code>
|
|||
|
this implies that <code>if_else()</code> must always evaluate both
|
|||
|
<code>yes</code> and <code>no</code> in order to figure out the correct
|
|||
|
type. I think this is consistent with <code>&amp;&amp;</code> (scalar
|
|||
|
operation, short circuits) and <code>&amp;</code> (vectorised, evaluates
|
|||
|
both sides).</p></li>
|
|||
|
<li><p><code>vec_size(if_else(test, yes, no))</code> equals
|
|||
|
<code>vec_size_common(test, yes, no)</code>. I think the output could
|
|||
|
have the same size as <code>test</code> (i.e., the same behaviour as
|
|||
|
<code>ifelse</code>), but I <em>think</em> as a general rule that your
|
|||
|
inputs should either be mutually recycling or not.</p></li>
|
|||
|
</ul>
|
|||
|
<p>This leads to the following implementation:</p>
|
|||
|
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" tabindex="-1"></a>if_else <span class="ot"><-</span> <span class="cf">function</span>(test, yes, no) {</span>
|
|||
|
<span id="cb28-2"><a href="#cb28-2" tabindex="-1"></a> <span class="cf">if</span> (<span class="sc">!</span><span class="fu">is_logical</span>(test)) {</span>
|
|||
|
<span id="cb28-3"><a href="#cb28-3" tabindex="-1"></a> <span class="fu">abort</span>(<span class="st">"`test` must be a logical vector."</span>)</span>
|
|||
|
<span id="cb28-4"><a href="#cb28-4" tabindex="-1"></a> }</span>
|
|||
|
<span id="cb28-5"><a href="#cb28-5" tabindex="-1"></a> </span>
|
|||
|
<span id="cb28-6"><a href="#cb28-6" tabindex="-1"></a> <span class="fu">c</span>(yes, no) <span class="sc">%<-%</span> <span class="fu">vec_cast_common</span>(yes, no)</span>
|
|||
|
<span id="cb28-7"><a href="#cb28-7" tabindex="-1"></a> <span class="fu">c</span>(test, yes, no) <span class="sc">%<-%</span> <span class="fu">vec_recycle_common</span>(test, yes, no)</span>
|
|||
|
<span id="cb28-8"><a href="#cb28-8" tabindex="-1"></a></span>
|
|||
|
<span id="cb28-9"><a href="#cb28-9" tabindex="-1"></a> out <span class="ot"><-</span> <span class="fu">vec_init</span>(yes, <span class="fu">vec_size</span>(yes))</span>
|
|||
|
<span id="cb28-10"><a href="#cb28-10" tabindex="-1"></a> <span class="fu">vec_slice</span>(out, test) <span class="ot"><-</span> <span class="fu">vec_slice</span>(yes, test)</span>
|
|||
|
<span id="cb28-11"><a href="#cb28-11" tabindex="-1"></a> <span class="fu">vec_slice</span>(out, <span class="sc">!</span>test) <span class="ot"><-</span> <span class="fu">vec_slice</span>(no, <span class="sc">!</span>test)</span>
|
|||
|
<span id="cb28-12"><a href="#cb28-12" tabindex="-1"></a></span>
|
|||
|
<span id="cb28-13"><a href="#cb28-13" tabindex="-1"></a> out</span>
|
|||
|
<span id="cb28-14"><a href="#cb28-14" tabindex="-1"></a>}</span>
|
|||
|
<span id="cb28-15"><a href="#cb28-15" tabindex="-1"></a></span>
|
|||
|
<span id="cb28-16"><a href="#cb28-16" tabindex="-1"></a>x <span class="ot"><-</span> <span class="fu">c</span>(<span class="cn">NA</span>, <span class="dv">1</span><span class="sc">:</span><span class="dv">4</span>)</span>
|
|||
|
<span id="cb28-17"><a href="#cb28-17" tabindex="-1"></a><span class="fu">if_else</span>(x <span class="sc">></span> <span class="dv">2</span>, <span class="st">"small"</span>, <span class="st">"big"</span>)</span>
|
|||
|
<span id="cb28-18"><a href="#cb28-18" tabindex="-1"></a><span class="co">#> [1] NA "big" "big" "small" "small"</span></span>
|
|||
|
<span id="cb28-19"><a href="#cb28-19" tabindex="-1"></a><span class="fu">if_else</span>(x <span class="sc">></span> <span class="dv">2</span>, <span class="fu">factor</span>(<span class="st">"small"</span>), <span class="fu">factor</span>(<span class="st">"big"</span>))</span>
|
|||
|
<span id="cb28-20"><a href="#cb28-20" tabindex="-1"></a><span class="co">#> [1] &lt;NA&gt; big big small small</span></span>
|
|||
|
<span id="cb28-21"><a href="#cb28-21" tabindex="-1"></a><span class="co">#> Levels: small big</span></span>
|
|||
|
<span id="cb28-22"><a href="#cb28-22" tabindex="-1"></a><span class="fu">if_else</span>(x <span class="sc">></span> <span class="dv">2</span>, <span class="fu">Sys.Date</span>(), <span class="fu">Sys.Date</span>() <span class="sc">+</span> <span class="dv">7</span>)</span>
|
|||
|
<span id="cb28-23"><a href="#cb28-23" tabindex="-1"></a><span class="co">#> [1] NA "2023-12-08" "2023-12-08" "2023-12-01" "2023-12-01"</span></span></code></pre></div>
|
|||
|
<p>By using <code>vec_size()</code> and <code>vec_slice()</code>, this
|
|||
|
definition of <code>if_else()</code> automatically works with
|
|||
|
data.frames and matrices:</p>
|
|||
|
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" tabindex="-1"></a><span class="fu">if_else</span>(x <span class="sc">></span> <span class="dv">2</span>, <span class="fu">data.frame</span>(<span class="at">x =</span> <span class="dv">1</span>), <span class="fu">data.frame</span>(<span class="at">y =</span> <span class="dv">2</span>))</span>
|
|||
|
<span id="cb29-2"><a href="#cb29-2" tabindex="-1"></a><span class="co">#> x y</span></span>
|
|||
|
<span id="cb29-3"><a href="#cb29-3" tabindex="-1"></a><span class="co">#> 1 NA NA</span></span>
|
|||
|
<span id="cb29-4"><a href="#cb29-4" tabindex="-1"></a><span class="co">#> 2 NA 2</span></span>
|
|||
|
<span id="cb29-5"><a href="#cb29-5" tabindex="-1"></a><span class="co">#> 3 NA 2</span></span>
|
|||
|
<span id="cb29-6"><a href="#cb29-6" tabindex="-1"></a><span class="co">#> 4 1 NA</span></span>
|
|||
|
<span id="cb29-7"><a href="#cb29-7" tabindex="-1"></a><span class="co">#> 5 1 NA</span></span>
|
|||
|
<span id="cb29-8"><a href="#cb29-8" tabindex="-1"></a></span>
|
|||
|
<span id="cb29-9"><a href="#cb29-9" tabindex="-1"></a><span class="fu">if_else</span>(x <span class="sc">></span> <span class="dv">2</span>, <span class="fu">matrix</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">10</span>, <span class="at">ncol =</span> <span class="dv">2</span>), <span class="fu">cbind</span>(<span class="dv">30</span>, <span class="dv">30</span>))</span>
|
|||
|
<span id="cb29-10"><a href="#cb29-10" tabindex="-1"></a><span class="co">#> [,1] [,2]</span></span>
|
|||
|
<span id="cb29-11"><a href="#cb29-11" tabindex="-1"></a><span class="co">#> [1,] NA NA</span></span>
|
|||
|
<span id="cb29-12"><a href="#cb29-12" tabindex="-1"></a><span class="co">#> [2,] 30 30</span></span>
|
|||
|
<span id="cb29-13"><a href="#cb29-13" tabindex="-1"></a><span class="co">#> [3,] 30 30</span></span>
|
|||
|
<span id="cb29-14"><a href="#cb29-14" tabindex="-1"></a><span class="co">#> [4,] 4 9</span></span>
|
|||
|
<span id="cb29-15"><a href="#cb29-15" tabindex="-1"></a><span class="co">#> [5,] 5 10</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<!-- code folding -->
|
|||
|
|
|||
|
|
|||
|
<!-- dynamically load mathjax for compatibility with self-contained -->
|
|||
|
<script>
|
|||
|
// Immediately-invoked function: creates a <script> element pointing at the
// MathJax URL and appends it to the document's first <head> element, so the
// library is requested at runtime instead of being embedded in this page.
// Wrapping the work in a function keeps the `script` variable out of global scope.
(function () {
|
|||
|
var script = document.createElement("script");
|
|||
|
script.type = "text/javascript";
|
|||
|
// The `config` query parameter names the MathJax configuration to load.
script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
|
|||
|
// Appending the element to <head> is what triggers the fetch.
document.getElementsByTagName("head")[0].appendChild(script);
|
|||
|
})();
|
|||
|
</script>
|
|||
|
|
|||
|
</body>
|
|||
|
</html>
|