719 lines
44 KiB
HTML
719 lines
44 KiB
HTML
|
<!DOCTYPE html>
|
|||
|
|
|||
|
<html lang="en">
|
|||
|
|
|||
|
<head>
|
|||
|
|
|||
|
<meta charset="utf-8" />
|
|||
|
<meta name="generator" content="pandoc" />
|
|||
|
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
|
|||
|
|
|||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<title>Introduction to stringr</title>
|
|||
|
|
|||
|
<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
|
|||
|
// be compatible with the behavior of Pandoc < 2.8).
|
|||
|
// Once the DOM has been parsed, look at the first child of every pandoc
// section wrapper (div.section with a "level" class). When that child is a
// heading, strip all of its attributes so that only the wrapping div keeps them.
document.addEventListener('DOMContentLoaded', function(e) {
  var firstChildren = document.querySelectorAll("div.section[class*='level'] > :first-child");
  Array.prototype.forEach.call(firstChildren, function(node) {
    // anything that is not h1-h6 is left untouched
    if (!/^h[1-6]$/i.test(node.tagName)) return;
    // `attributes` is a live collection: removing its first entry shrinks it,
    // so keep removing index 0 until nothing is left
    var attrs = node.attributes;
    while (attrs.length > 0) node.removeAttribute(attrs[0].name);
  });
});
|
|||
|
</script>
|
|||
|
|
|||
|
<style type="text/css">
|
|||
|
code{white-space: pre-wrap;}
|
|||
|
span.smallcaps{font-variant: small-caps;}
|
|||
|
span.underline{text-decoration: underline;}
|
|||
|
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
|||
|
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
|||
|
ul.task-list{list-style: none;}
|
|||
|
</style>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<style type="text/css">
|
|||
|
code {
|
|||
|
white-space: pre;
|
|||
|
}
|
|||
|
.sourceCode {
|
|||
|
overflow: visible;
|
|||
|
}
|
|||
|
</style>
|
|||
|
<style type="text/css" data-origin="pandoc">
|
|||
|
pre > code.sourceCode { white-space: pre; position: relative; }
|
|||
|
pre > code.sourceCode > span { line-height: 1.25; }
|
|||
|
pre > code.sourceCode > span:empty { height: 1.2em; }
|
|||
|
.sourceCode { overflow: visible; }
|
|||
|
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
|||
|
div.sourceCode { margin: 1em 0; }
|
|||
|
pre.sourceCode { margin: 0; }
|
|||
|
@media screen {
|
|||
|
div.sourceCode { overflow: auto; }
|
|||
|
}
|
|||
|
@media print {
|
|||
|
pre > code.sourceCode { white-space: pre-wrap; }
|
|||
|
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
|
|||
|
}
|
|||
|
pre.numberSource code
|
|||
|
{ counter-reset: source-line 0; }
|
|||
|
pre.numberSource code > span
|
|||
|
{ position: relative; left: -4em; counter-increment: source-line; }
|
|||
|
pre.numberSource code > span > a:first-child::before
|
|||
|
{ content: counter(source-line);
|
|||
|
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
|||
|
border: none; display: inline-block;
|
|||
|
-webkit-touch-callout: none; -webkit-user-select: none;
|
|||
|
-khtml-user-select: none; -moz-user-select: none;
|
|||
|
-ms-user-select: none; user-select: none;
|
|||
|
padding: 0 4px; width: 4em;
|
|||
|
color: #aaaaaa;
|
|||
|
}
|
|||
|
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
|
|||
|
div.sourceCode
|
|||
|
{ }
|
|||
|
@media screen {
|
|||
|
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
|||
|
}
|
|||
|
code span.al { color: #ff0000; font-weight: bold; }
|
|||
|
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.at { color: #7d9029; }
|
|||
|
code span.bn { color: #40a070; }
|
|||
|
code span.bu { color: #008000; }
|
|||
|
code span.cf { color: #007020; font-weight: bold; }
|
|||
|
code span.ch { color: #4070a0; }
|
|||
|
code span.cn { color: #880000; }
|
|||
|
code span.co { color: #60a0b0; font-style: italic; }
|
|||
|
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.do { color: #ba2121; font-style: italic; }
|
|||
|
code span.dt { color: #902000; }
|
|||
|
code span.dv { color: #40a070; }
|
|||
|
code span.er { color: #ff0000; font-weight: bold; }
|
|||
|
code span.ex { }
|
|||
|
code span.fl { color: #40a070; }
|
|||
|
code span.fu { color: #06287e; }
|
|||
|
code span.im { color: #008000; font-weight: bold; }
|
|||
|
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.kw { color: #007020; font-weight: bold; }
|
|||
|
code span.op { color: #666666; }
|
|||
|
code span.ot { color: #007020; }
|
|||
|
code span.pp { color: #bc7a00; }
|
|||
|
code span.sc { color: #4070a0; }
|
|||
|
code span.ss { color: #bb6688; }
|
|||
|
code span.st { color: #4070a0; }
|
|||
|
code span.va { color: #19177c; }
|
|||
|
code span.vs { color: #4070a0; }
|
|||
|
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
</style>
|
|||
|
<script>
|
|||
|
// Apply pandoc's div.sourceCode style to pre.sourceCode instead.
//
// Walks every stylesheet whose owner element carries data-origin="pandoc",
// finds `div.sourceCode` rules that set a text or background colour, and
// replaces each one in place with a `pre.sourceCode` rule holding the same
// declarations.
(function() {
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    // ownerNode is null for sheets loaded through @import, and not every
    // owner node exposes `dataset`; skip those sheets instead of throwing.
    var owner = sheets[i].ownerNode;
    if (!owner || !owner.dataset || owner.dataset["origin"] !== "pandoc") continue;
    // accessing cssRules can throw (e.g. for a cross-origin sheet); such
    // sheets are skipped
    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
    var j = 0;
    while (j < rules.length) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
        j++;
        continue;
      }
      var style = rule.style.cssText;
      // check if color or background-color is set
      if (rule.style.color === '' && rule.style.backgroundColor === '') {
        j++;
        continue;
      }
      // replace div.sourceCode by a pre.sourceCode rule; j is not advanced
      // here because the inserted rule fails the selector test above and is
      // stepped over on the next pass
      sheets[i].deleteRule(j);
      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
|
|||
|
</script>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<style type="text/css">body {
|
|||
|
background-color: #fff;
|
|||
|
margin: 1em auto;
|
|||
|
max-width: 700px;
|
|||
|
overflow: visible;
|
|||
|
padding-left: 2em;
|
|||
|
padding-right: 2em;
|
|||
|
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
|
|||
|
font-size: 14px;
|
|||
|
line-height: 1.35;
|
|||
|
}
|
|||
|
#TOC {
|
|||
|
clear: both;
|
|||
|
margin: 0 0 10px 10px;
|
|||
|
padding: 4px;
|
|||
|
width: 400px;
|
|||
|
border: 1px solid #CCCCCC;
|
|||
|
border-radius: 5px;
|
|||
|
background-color: #f6f6f6;
|
|||
|
font-size: 13px;
|
|||
|
line-height: 1.3;
|
|||
|
}
|
|||
|
#TOC .toctitle {
|
|||
|
font-weight: bold;
|
|||
|
font-size: 15px;
|
|||
|
margin-left: 5px;
|
|||
|
}
|
|||
|
#TOC ul {
|
|||
|
padding-left: 40px;
|
|||
|
margin-left: -1.5em;
|
|||
|
margin-top: 5px;
|
|||
|
margin-bottom: 5px;
|
|||
|
}
|
|||
|
#TOC ul ul {
|
|||
|
margin-left: -2em;
|
|||
|
}
|
|||
|
#TOC li {
|
|||
|
line-height: 16px;
|
|||
|
}
|
|||
|
table {
|
|||
|
margin: 1em auto;
|
|||
|
border-width: 1px;
|
|||
|
border-color: #DDDDDD;
|
|||
|
border-style: outset;
|
|||
|
border-collapse: collapse;
|
|||
|
}
|
|||
|
table th {
|
|||
|
border-width: 2px;
|
|||
|
padding: 5px;
|
|||
|
border-style: inset;
|
|||
|
}
|
|||
|
table td {
|
|||
|
border-width: 1px;
|
|||
|
border-style: inset;
|
|||
|
line-height: 18px;
|
|||
|
padding: 5px 5px;
|
|||
|
}
|
|||
|
table, table th, table td {
|
|||
|
border-left-style: none;
|
|||
|
border-right-style: none;
|
|||
|
}
|
|||
|
table thead, table tr.even {
|
|||
|
background-color: #f7f7f7;
|
|||
|
}
|
|||
|
p {
|
|||
|
margin: 0.5em 0;
|
|||
|
}
|
|||
|
blockquote {
|
|||
|
background-color: #f6f6f6;
|
|||
|
padding: 0.25em 0.75em;
|
|||
|
}
|
|||
|
/* Horizontal rule: every border is cleared, then a single 1px top line is
   drawn. The former `border-style: solid` was removed because the `border`
   shorthand right after it reset it, so it never took effect. */
hr {
  border: none;
  border-top: 1px solid #777;
  margin: 28px 0;
}
|
|||
|
dl {
|
|||
|
margin-left: 0;
|
|||
|
}
|
|||
|
dl dd {
|
|||
|
margin-bottom: 13px;
|
|||
|
margin-left: 13px;
|
|||
|
}
|
|||
|
dl dt {
|
|||
|
font-weight: bold;
|
|||
|
}
|
|||
|
ul {
|
|||
|
margin-top: 0;
|
|||
|
}
|
|||
|
ul li {
|
|||
|
list-style: circle outside;
|
|||
|
}
|
|||
|
ul ul {
|
|||
|
margin-bottom: 0;
|
|||
|
}
|
|||
|
pre, code {
|
|||
|
background-color: #f7f7f7;
|
|||
|
border-radius: 3px;
|
|||
|
color: #333;
|
|||
|
white-space: pre-wrap;
|
|||
|
}
|
|||
|
pre {
|
|||
|
border-radius: 3px;
|
|||
|
margin: 5px 0px 10px 0px;
|
|||
|
padding: 10px;
|
|||
|
}
|
|||
|
pre:not([class]) {
|
|||
|
background-color: #f7f7f7;
|
|||
|
}
|
|||
|
code {
|
|||
|
font-family: Consolas, Monaco, 'Courier New', monospace;
|
|||
|
font-size: 85%;
|
|||
|
}
|
|||
|
p > code, li > code {
|
|||
|
padding: 2px 0px;
|
|||
|
}
|
|||
|
div.figure {
|
|||
|
text-align: center;
|
|||
|
}
|
|||
|
/* Images: white backing, small padding and a thin rounded frame. `border`
   used to be declared twice; only the later #CCCCCC value applied, so the
   overridden #DDDDDD declaration was dropped. */
img {
  background-color: #FFFFFF;
  padding: 2px;
  border: 1px solid #CCCCCC;
  border-radius: 3px;
  margin: 0 5px;
}
|
|||
|
h1 {
|
|||
|
margin-top: 0;
|
|||
|
font-size: 35px;
|
|||
|
line-height: 40px;
|
|||
|
}
|
|||
|
h2 {
|
|||
|
border-bottom: 4px solid #f7f7f7;
|
|||
|
padding-top: 10px;
|
|||
|
padding-bottom: 2px;
|
|||
|
font-size: 145%;
|
|||
|
}
|
|||
|
h3 {
|
|||
|
border-bottom: 2px solid #f7f7f7;
|
|||
|
padding-top: 10px;
|
|||
|
font-size: 120%;
|
|||
|
}
|
|||
|
h4 {
|
|||
|
border-bottom: 1px solid #f7f7f7;
|
|||
|
margin-left: 8px;
|
|||
|
font-size: 105%;
|
|||
|
}
|
|||
|
h5, h6 {
|
|||
|
border-bottom: 1px solid #ccc;
|
|||
|
font-size: 105%;
|
|||
|
}
|
|||
|
a {
|
|||
|
color: #0033dd;
|
|||
|
text-decoration: none;
|
|||
|
}
|
|||
|
a:hover {
|
|||
|
color: #6666ff; }
|
|||
|
a:visited {
|
|||
|
color: #800080; }
|
|||
|
a:visited:hover {
|
|||
|
color: #BB00BB; }
|
|||
|
a[href^="http:"] {
|
|||
|
text-decoration: underline; }
|
|||
|
a[href^="https:"] {
|
|||
|
text-decoration: underline; }
|
|||
|
|
|||
|
code > span.kw { color: #555; font-weight: bold; }
|
|||
|
code > span.dt { color: #902000; }
|
|||
|
code > span.dv { color: #40a070; }
|
|||
|
code > span.bn { color: #d14; }
|
|||
|
code > span.fl { color: #d14; }
|
|||
|
code > span.ch { color: #d14; }
|
|||
|
code > span.st { color: #d14; }
|
|||
|
code > span.co { color: #888888; font-style: italic; }
|
|||
|
code > span.ot { color: #007020; }
|
|||
|
code > span.al { color: #ff0000; font-weight: bold; }
|
|||
|
code > span.fu { color: #900; font-weight: bold; }
|
|||
|
code > span.er { color: #a61717; background-color: #e3d2d2; }
|
|||
|
</style>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
</head>
|
|||
|
|
|||
|
<body>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<h1 class="title toc-ignore">Introduction to stringr</h1>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<p>There are four main families of functions in stringr:</p>
|
|||
|
<ol style="list-style-type: decimal">
|
|||
|
<li><p>Character manipulation: these functions allow you to manipulate
|
|||
|
individual characters within the strings in character vectors.</p></li>
|
|||
|
<li><p>Whitespace tools to add, remove, and manipulate
|
|||
|
whitespace.</p></li>
|
|||
|
<li><p>Locale-sensitive operations whose results will vary from
|
|||
|
locale to locale.</p></li>
|
|||
|
<li><p>Pattern matching functions. These recognise four engines of
|
|||
|
pattern description. The most common is regular expressions, but there
|
|||
|
are three other tools.</p></li>
|
|||
|
</ol>
|
|||
|
<div id="getting-and-setting-individual-characters" class="section level2">
|
|||
|
<h2>Getting and setting individual characters</h2>
|
|||
|
<p>You can get the length of the string with
|
|||
|
<code>str_length()</code>:</p>
|
|||
|
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a><span class="fu">str_length</span>(<span class="st">"abc"</span>)</span>
|
|||
|
<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a><span class="co">#> [1] 3</span></span></code></pre></div>
|
|||
|
<p>This is now equivalent to the base R function <code>nchar()</code>.
|
|||
|
Previously it was needed to work around issues with <code>nchar()</code>
|
|||
|
such as the fact that it returned 2 for <code>nchar(NA)</code>. This has
|
|||
|
been fixed as of R 3.3.0, so it is no longer so important.</p>
|
|||
|
<p>You can access individual characters using <code>str_sub()</code>. It
|
|||
|
takes three arguments: a character vector, a <code>start</code> position
|
|||
|
and an <code>end</code> position. Each position can be either a
|
|||
|
positive integer, which counts from the left, or a negative integer
|
|||
|
which counts from the right. The positions are inclusive, and if longer
|
|||
|
than the string, will be silently truncated.</p>
|
|||
|
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">"abcdef"</span>, <span class="st">"ghifjk"</span>)</span>
|
|||
|
<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a></span>
|
|||
|
<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a><span class="co"># The 3rd letter</span></span>
|
|||
|
<span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a><span class="fu">str_sub</span>(x, <span class="dv">3</span>, <span class="dv">3</span>)</span>
|
|||
|
<span id="cb2-5"><a href="#cb2-5" tabindex="-1"></a><span class="co">#> [1] "c" "i"</span></span>
|
|||
|
<span id="cb2-6"><a href="#cb2-6" tabindex="-1"></a></span>
|
|||
|
<span id="cb2-7"><a href="#cb2-7" tabindex="-1"></a><span class="co"># The 2nd to 2nd-to-last character</span></span>
|
|||
|
<span id="cb2-8"><a href="#cb2-8" tabindex="-1"></a><span class="fu">str_sub</span>(x, <span class="dv">2</span>, <span class="sc">-</span><span class="dv">2</span>)</span>
|
|||
|
<span id="cb2-9"><a href="#cb2-9" tabindex="-1"></a><span class="co">#> [1] "bcde" "hifj"</span></span></code></pre></div>
|
|||
|
<p>You can also use <code>str_sub()</code> to modify strings:</p>
|
|||
|
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a><span class="fu">str_sub</span>(x, <span class="dv">3</span>, <span class="dv">3</span>) <span class="ot">&lt;-</span> <span class="st">"X"</span></span>
|
|||
|
<span id="cb3-2"><a href="#cb3-2" tabindex="-1"></a>x</span>
|
|||
|
<span id="cb3-3"><a href="#cb3-3" tabindex="-1"></a><span class="co">#> [1] "abXdef" "ghXfjk"</span></span></code></pre></div>
|
|||
|
<p>To duplicate individual strings, you can use
|
|||
|
<code>str_dup()</code>:</p>
|
|||
|
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a><span class="fu">str_dup</span>(x, <span class="fu">c</span>(<span class="dv">2</span>, <span class="dv">3</span>))</span>
|
|||
|
<span id="cb4-2"><a href="#cb4-2" tabindex="-1"></a><span class="co">#> [1] "abXdefabXdef" "ghXfjkghXfjkghXfjk"</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="whitespace" class="section level2">
|
|||
|
<h2>Whitespace</h2>
|
|||
|
<p>Three functions add, remove, or modify whitespace:</p>
|
|||
|
<ol style="list-style-type: decimal">
|
|||
|
<li><p><code>str_pad()</code> pads a string to a fixed length by adding
|
|||
|
extra whitespace on the left, right, or both sides.</p>
|
|||
|
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">"abc"</span>, <span class="st">"defghi"</span>)</span>
|
|||
|
<span id="cb5-2"><a href="#cb5-2" tabindex="-1"></a><span class="fu">str_pad</span>(x, <span class="dv">10</span>) <span class="co"># default pads on left</span></span>
|
|||
|
<span id="cb5-3"><a href="#cb5-3" tabindex="-1"></a><span class="co">#> [1] "       abc" "    defghi"</span></span>
|
|||
|
<span id="cb5-4"><a href="#cb5-4" tabindex="-1"></a><span class="fu">str_pad</span>(x, <span class="dv">10</span>, <span class="st">"both"</span>)</span>
|
|||
|
<span id="cb5-5"><a href="#cb5-5" tabindex="-1"></a><span class="co">#> [1] "   abc    " "  defghi  "</span></span></code></pre></div>
|
|||
|
<p>(You can pad with other characters by using the <code>pad</code>
|
|||
|
argument.)</p>
|
|||
|
<p><code>str_pad()</code> will never make a string shorter:</p>
|
|||
|
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a><span class="fu">str_pad</span>(x, <span class="dv">4</span>)</span>
|
|||
|
<span id="cb6-2"><a href="#cb6-2" tabindex="-1"></a><span class="co">#> [1] " abc" "defghi"</span></span></code></pre></div>
|
|||
|
<p>So if you want to ensure that all strings are the same length (often
|
|||
|
useful for print methods), combine <code>str_pad()</code> and
|
|||
|
<code>str_trunc()</code>:</p>
|
|||
|
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">"Short"</span>, <span class="st">"This is a long string"</span>)</span>
|
|||
|
<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a></span>
|
|||
|
<span id="cb7-3"><a href="#cb7-3" tabindex="-1"></a>x <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb7-4"><a href="#cb7-4" tabindex="-1"></a> <span class="fu">str_trunc</span>(<span class="dv">10</span>) <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb7-5"><a href="#cb7-5" tabindex="-1"></a> <span class="fu">str_pad</span>(<span class="dv">10</span>, <span class="st">"right"</span>)</span>
|
|||
|
<span id="cb7-6"><a href="#cb7-6" tabindex="-1"></a><span class="co">#> [1] "Short     " "This is..."</span></span></code></pre></div></li>
|
|||
|
<li><p>The opposite of <code>str_pad()</code> is
|
|||
|
<code>str_trim()</code>, which removes leading and trailing
|
|||
|
whitespace:</p>
|
|||
|
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">" a "</span>, <span class="st">"b "</span>, <span class="st">" c"</span>)</span>
|
|||
|
<span id="cb8-2"><a href="#cb8-2" tabindex="-1"></a><span class="fu">str_trim</span>(x)</span>
|
|||
|
<span id="cb8-3"><a href="#cb8-3" tabindex="-1"></a><span class="co">#> [1] "a" "b" "c"</span></span>
|
|||
|
<span id="cb8-4"><a href="#cb8-4" tabindex="-1"></a><span class="fu">str_trim</span>(x, <span class="st">"left"</span>)</span>
|
|||
|
<span id="cb8-5"><a href="#cb8-5" tabindex="-1"></a><span class="co">#> [1] "a " "b " "c"</span></span></code></pre></div></li>
|
|||
|
<li><p>You can use <code>str_wrap()</code> to modify existing whitespace
|
|||
|
in order to wrap a paragraph of text, such that the length of each line
|
|||
|
is as similar as possible.</p>
|
|||
|
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a>jabberwocky <span class="ot">&lt;-</span> <span class="fu">str_c</span>(</span>
|
|||
|
<span id="cb9-2"><a href="#cb9-2" tabindex="-1"></a> <span class="st">"`Twas brillig, and the slithy toves "</span>,</span>
|
|||
|
<span id="cb9-3"><a href="#cb9-3" tabindex="-1"></a> <span class="st">"did gyre and gimble in the wabe: "</span>,</span>
|
|||
|
<span id="cb9-4"><a href="#cb9-4" tabindex="-1"></a> <span class="st">"All mimsy were the borogoves, "</span>,</span>
|
|||
|
<span id="cb9-5"><a href="#cb9-5" tabindex="-1"></a> <span class="st">"and the mome raths outgrabe. "</span></span>
|
|||
|
<span id="cb9-6"><a href="#cb9-6" tabindex="-1"></a>)</span>
|
|||
|
<span id="cb9-7"><a href="#cb9-7" tabindex="-1"></a><span class="fu">cat</span>(<span class="fu">str_wrap</span>(jabberwocky, <span class="at">width =</span> <span class="dv">40</span>))</span>
|
|||
|
<span id="cb9-8"><a href="#cb9-8" tabindex="-1"></a><span class="co">#> `Twas brillig, and the slithy toves did</span></span>
|
|||
|
<span id="cb9-9"><a href="#cb9-9" tabindex="-1"></a><span class="co">#> gyre and gimble in the wabe: All mimsy</span></span>
|
|||
|
<span id="cb9-10"><a href="#cb9-10" tabindex="-1"></a><span class="co">#> were the borogoves, and the mome raths</span></span>
|
|||
|
<span id="cb9-11"><a href="#cb9-11" tabindex="-1"></a><span class="co">#> outgrabe.</span></span></code></pre></div></li>
|
|||
|
</ol>
|
|||
|
</div>
|
|||
|
<div id="locale-sensitive" class="section level2">
|
|||
|
<h2>Locale sensitive</h2>
|
|||
|
<p>A handful of stringr functions are locale-sensitive: they will
|
|||
|
perform differently in different regions of the world. These functions
|
|||
|
are case transformation functions:</p>
|
|||
|
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="st">"I like horses."</span></span>
|
|||
|
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a><span class="fu">str_to_upper</span>(x)</span>
|
|||
|
<span id="cb10-3"><a href="#cb10-3" tabindex="-1"></a><span class="co">#> [1] "I LIKE HORSES."</span></span>
|
|||
|
<span id="cb10-4"><a href="#cb10-4" tabindex="-1"></a><span class="fu">str_to_title</span>(x)</span>
|
|||
|
<span id="cb10-5"><a href="#cb10-5" tabindex="-1"></a><span class="co">#> [1] "I Like Horses."</span></span>
|
|||
|
<span id="cb10-6"><a href="#cb10-6" tabindex="-1"></a></span>
|
|||
|
<span id="cb10-7"><a href="#cb10-7" tabindex="-1"></a><span class="fu">str_to_lower</span>(x)</span>
|
|||
|
<span id="cb10-8"><a href="#cb10-8" tabindex="-1"></a><span class="co">#> [1] "i like horses."</span></span>
|
|||
|
<span id="cb10-9"><a href="#cb10-9" tabindex="-1"></a><span class="co"># Turkish has two sorts of i: with and without the dot</span></span>
|
|||
|
<span id="cb10-10"><a href="#cb10-10" tabindex="-1"></a><span class="fu">str_to_lower</span>(x, <span class="st">"tr"</span>)</span>
|
|||
|
<span id="cb10-11"><a href="#cb10-11" tabindex="-1"></a><span class="co">#> [1] "ı like horses."</span></span></code></pre></div>
|
|||
|
<p>String ordering and sorting:</p>
|
|||
|
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">"y"</span>, <span class="st">"i"</span>, <span class="st">"k"</span>)</span>
|
|||
|
<span id="cb11-2"><a href="#cb11-2" tabindex="-1"></a><span class="fu">str_order</span>(x)</span>
|
|||
|
<span id="cb11-3"><a href="#cb11-3" tabindex="-1"></a><span class="co">#> [1] 2 3 1</span></span>
|
|||
|
<span id="cb11-4"><a href="#cb11-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb11-5"><a href="#cb11-5" tabindex="-1"></a><span class="fu">str_sort</span>(x)</span>
|
|||
|
<span id="cb11-6"><a href="#cb11-6" tabindex="-1"></a><span class="co">#> [1] "i" "k" "y"</span></span>
|
|||
|
<span id="cb11-7"><a href="#cb11-7" tabindex="-1"></a><span class="co"># In Lithuanian, y comes between i and k</span></span>
|
|||
|
<span id="cb11-8"><a href="#cb11-8" tabindex="-1"></a><span class="fu">str_sort</span>(x, <span class="at">locale =</span> <span class="st">"lt"</span>)</span>
|
|||
|
<span id="cb11-9"><a href="#cb11-9" tabindex="-1"></a><span class="co">#> [1] "i" "y" "k"</span></span></code></pre></div>
|
|||
|
<p>The locale always defaults to English to ensure that the default
|
|||
|
behaviour is identical across systems. Locales always include a two
|
|||
|
letter ISO-639-1 language code (like “en” for English or “zh” for
|
|||
|
Chinese), and optionally an ISO-3166 country code (like “en_GB” vs
|
|||
|
“en_US”). You can see a complete list of available locales by running
|
|||
|
<code>stringi::stri_locale_list()</code>.</p>
|
|||
|
</div>
|
|||
|
<div id="pattern-matching" class="section level2">
|
|||
|
<h2>Pattern matching</h2>
|
|||
|
<p>The vast majority of stringr functions work with patterns. These are
|
|||
|
parameterised by the task they perform and the types of patterns they
|
|||
|
match.</p>
|
|||
|
<div id="tasks" class="section level3">
|
|||
|
<h3>Tasks</h3>
|
|||
|
<p>Each pattern matching function has the same first two arguments, a
|
|||
|
character vector of <code>string</code>s to process and a single
|
|||
|
<code>pattern</code> to match. stringr provides pattern matching
|
|||
|
functions to <strong>detect</strong>, <strong>locate</strong>,
|
|||
|
<strong>extract</strong>, <strong>match</strong>,
|
|||
|
<strong>replace</strong>, and <strong>split</strong> strings. I’ll
|
|||
|
illustrate how they work with some strings and a regular expression
|
|||
|
designed to match (US) phone numbers:</p>
|
|||
|
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" tabindex="-1"></a>strings <span class="ot">&lt;-</span> <span class="fu">c</span>(</span>
|
|||
|
<span id="cb12-2"><a href="#cb12-2" tabindex="-1"></a> <span class="st">"apple"</span>, </span>
|
|||
|
<span id="cb12-3"><a href="#cb12-3" tabindex="-1"></a> <span class="st">"219 733 8965"</span>, </span>
|
|||
|
<span id="cb12-4"><a href="#cb12-4" tabindex="-1"></a> <span class="st">"329-293-8753"</span>, </span>
|
|||
|
<span id="cb12-5"><a href="#cb12-5" tabindex="-1"></a> <span class="st">"Work: 579-499-7527; Home: 543.355.3679"</span></span>
|
|||
|
<span id="cb12-6"><a href="#cb12-6" tabindex="-1"></a>)</span>
|
|||
|
<span id="cb12-7"><a href="#cb12-7" tabindex="-1"></a>phone <span class="ot">&lt;-</span> <span class="st">"([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"</span></span></code></pre></div>
|
|||
|
<ul>
|
|||
|
<li><p><code>str_detect()</code> detects the presence or absence of a
|
|||
|
pattern and returns a logical vector (similar to <code>grepl()</code>).
|
|||
|
<code>str_subset()</code> returns the elements of a character vector
|
|||
|
that match a regular expression (similar to <code>grep()</code> with
|
|||
|
<code>value = TRUE</code>).</p>
|
|||
|
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" tabindex="-1"></a><span class="co"># Which strings contain phone numbers?</span></span>
|
|||
|
<span id="cb13-2"><a href="#cb13-2" tabindex="-1"></a><span class="fu">str_detect</span>(strings, phone)</span>
|
|||
|
<span id="cb13-3"><a href="#cb13-3" tabindex="-1"></a><span class="co">#> [1] FALSE TRUE TRUE TRUE</span></span>
|
|||
|
<span id="cb13-4"><a href="#cb13-4" tabindex="-1"></a><span class="fu">str_subset</span>(strings, phone)</span>
|
|||
|
<span id="cb13-5"><a href="#cb13-5" tabindex="-1"></a><span class="co">#> [1] "219 733 8965" </span></span>
|
|||
|
<span id="cb13-6"><a href="#cb13-6" tabindex="-1"></a><span class="co">#> [2] "329-293-8753" </span></span>
|
|||
|
<span id="cb13-7"><a href="#cb13-7" tabindex="-1"></a><span class="co">#> [3] "Work: 579-499-7527; Home: 543.355.3679"</span></span></code></pre></div></li>
|
|||
|
<li><p><code>str_count()</code> counts the number of matches:</p>
|
|||
|
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" tabindex="-1"></a><span class="co"># How many phone numbers in each string?</span></span>
|
|||
|
<span id="cb14-2"><a href="#cb14-2" tabindex="-1"></a><span class="fu">str_count</span>(strings, phone)</span>
|
|||
|
<span id="cb14-3"><a href="#cb14-3" tabindex="-1"></a><span class="co">#> [1] 0 1 1 2</span></span></code></pre></div></li>
|
|||
|
<li><p><code>str_locate()</code> locates the <strong>first</strong>
|
|||
|
position of a pattern and returns a numeric matrix with columns start
|
|||
|
and end. <code>str_locate_all()</code> locates all matches, returning a
|
|||
|
list of numeric matrices. Similar to <code>regexpr()</code> and
|
|||
|
<code>gregexpr()</code>.</p>
|
|||
|
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" tabindex="-1"></a><span class="co"># Where in the string is the phone number located?</span></span>
|
|||
|
<span id="cb15-2"><a href="#cb15-2" tabindex="-1"></a>(loc <span class="ot">&lt;-</span> <span class="fu">str_locate</span>(strings, phone))</span>
|
|||
|
<span id="cb15-3"><a href="#cb15-3" tabindex="-1"></a><span class="co">#> start end</span></span>
|
|||
|
<span id="cb15-4"><a href="#cb15-4" tabindex="-1"></a><span class="co">#> [1,] NA NA</span></span>
|
|||
|
<span id="cb15-5"><a href="#cb15-5" tabindex="-1"></a><span class="co">#> [2,] 1 12</span></span>
|
|||
|
<span id="cb15-6"><a href="#cb15-6" tabindex="-1"></a><span class="co">#> [3,] 1 12</span></span>
|
|||
|
<span id="cb15-7"><a href="#cb15-7" tabindex="-1"></a><span class="co">#> [4,] 7 18</span></span>
|
|||
|
<span id="cb15-8"><a href="#cb15-8" tabindex="-1"></a><span class="fu">str_locate_all</span>(strings, phone)</span>
|
|||
|
<span id="cb15-9"><a href="#cb15-9" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb15-10"><a href="#cb15-10" tabindex="-1"></a><span class="co">#> start end</span></span>
|
|||
|
<span id="cb15-11"><a href="#cb15-11" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb15-12"><a href="#cb15-12" tabindex="-1"></a><span class="co">#> [[2]]</span></span>
|
|||
|
<span id="cb15-13"><a href="#cb15-13" tabindex="-1"></a><span class="co">#> start end</span></span>
|
|||
|
<span id="cb15-14"><a href="#cb15-14" tabindex="-1"></a><span class="co">#> [1,] 1 12</span></span>
|
|||
|
<span id="cb15-15"><a href="#cb15-15" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb15-16"><a href="#cb15-16" tabindex="-1"></a><span class="co">#> [[3]]</span></span>
|
|||
|
<span id="cb15-17"><a href="#cb15-17" tabindex="-1"></a><span class="co">#> start end</span></span>
|
|||
|
<span id="cb15-18"><a href="#cb15-18" tabindex="-1"></a><span class="co">#> [1,] 1 12</span></span>
|
|||
|
<span id="cb15-19"><a href="#cb15-19" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb15-20"><a href="#cb15-20" tabindex="-1"></a><span class="co">#> [[4]]</span></span>
|
|||
|
<span id="cb15-21"><a href="#cb15-21" tabindex="-1"></a><span class="co">#> start end</span></span>
|
|||
|
<span id="cb15-22"><a href="#cb15-22" tabindex="-1"></a><span class="co">#> [1,] 7 18</span></span>
|
|||
|
<span id="cb15-23"><a href="#cb15-23" tabindex="-1"></a><span class="co">#> [2,] 27 38</span></span></code></pre></div></li>
|
|||
|
<li><p><code>str_extract()</code> extracts text corresponding to the
|
|||
|
<strong>first</strong> match, returning a character vector.
|
|||
|
<code>str_extract_all()</code> extracts all matches and returns a list
|
|||
|
of character vectors.</p>
|
|||
|
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a><span class="co"># What are the phone numbers?</span></span>
|
|||
|
<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a><span class="fu">str_extract</span>(strings, phone)</span>
|
|||
|
<span id="cb16-3"><a href="#cb16-3" tabindex="-1"></a><span class="co">#> [1] NA "219 733 8965" "329-293-8753" "579-499-7527"</span></span>
|
|||
|
<span id="cb16-4"><a href="#cb16-4" tabindex="-1"></a><span class="fu">str_extract_all</span>(strings, phone)</span>
|
|||
|
<span id="cb16-5"><a href="#cb16-5" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb16-6"><a href="#cb16-6" tabindex="-1"></a><span class="co">#> character(0)</span></span>
|
|||
|
<span id="cb16-7"><a href="#cb16-7" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb16-8"><a href="#cb16-8" tabindex="-1"></a><span class="co">#> [[2]]</span></span>
|
|||
|
<span id="cb16-9"><a href="#cb16-9" tabindex="-1"></a><span class="co">#> [1] "219 733 8965"</span></span>
|
|||
|
<span id="cb16-10"><a href="#cb16-10" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb16-11"><a href="#cb16-11" tabindex="-1"></a><span class="co">#> [[3]]</span></span>
|
|||
|
<span id="cb16-12"><a href="#cb16-12" tabindex="-1"></a><span class="co">#> [1] "329-293-8753"</span></span>
|
|||
|
<span id="cb16-13"><a href="#cb16-13" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb16-14"><a href="#cb16-14" tabindex="-1"></a><span class="co">#> [[4]]</span></span>
|
|||
|
<span id="cb16-15"><a href="#cb16-15" tabindex="-1"></a><span class="co">#> [1] "579-499-7527" "543.355.3679"</span></span>
|
|||
|
<span id="cb16-16"><a href="#cb16-16" tabindex="-1"></a><span class="fu">str_extract_all</span>(strings, phone, <span class="at">simplify =</span> <span class="cn">TRUE</span>)</span>
|
|||
|
<span id="cb16-17"><a href="#cb16-17" tabindex="-1"></a><span class="co">#> [,1] [,2] </span></span>
|
|||
|
<span id="cb16-18"><a href="#cb16-18" tabindex="-1"></a><span class="co">#> [1,] "" "" </span></span>
|
|||
|
<span id="cb16-19"><a href="#cb16-19" tabindex="-1"></a><span class="co">#> [2,] "219 733 8965" "" </span></span>
|
|||
|
<span id="cb16-20"><a href="#cb16-20" tabindex="-1"></a><span class="co">#> [3,] "329-293-8753" "" </span></span>
|
|||
|
<span id="cb16-21"><a href="#cb16-21" tabindex="-1"></a><span class="co">#> [4,] "579-499-7527" "543.355.3679"</span></span></code></pre></div></li>
|
|||
|
<li><p><code>str_match()</code> extracts capture groups formed by
|
|||
|
<code>()</code> from the <strong>first</strong> match. It returns a
|
|||
|
character matrix with one column for the complete match and one column
|
|||
|
for each group. <code>str_match_all()</code> extracts capture groups
|
|||
|
from all matches and returns a list of character matrices. Similar to
|
|||
|
<code>regmatches()</code>.</p>
|
|||
|
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" tabindex="-1"></a><span class="co"># Pull out the three components of the match</span></span>
|
|||
|
<span id="cb17-2"><a href="#cb17-2" tabindex="-1"></a><span class="fu">str_match</span>(strings, phone)</span>
|
|||
|
<span id="cb17-3"><a href="#cb17-3" tabindex="-1"></a><span class="co">#> [,1] [,2] [,3] [,4] </span></span>
|
|||
|
<span id="cb17-4"><a href="#cb17-4" tabindex="-1"></a><span class="co">#> [1,] NA NA NA NA </span></span>
|
|||
|
<span id="cb17-5"><a href="#cb17-5" tabindex="-1"></a><span class="co">#> [2,] "219 733 8965" "219" "733" "8965"</span></span>
|
|||
|
<span id="cb17-6"><a href="#cb17-6" tabindex="-1"></a><span class="co">#> [3,] "329-293-8753" "329" "293" "8753"</span></span>
|
|||
|
<span id="cb17-7"><a href="#cb17-7" tabindex="-1"></a><span class="co">#> [4,] "579-499-7527" "579" "499" "7527"</span></span>
|
|||
|
<span id="cb17-8"><a href="#cb17-8" tabindex="-1"></a><span class="fu">str_match_all</span>(strings, phone)</span>
|
|||
|
<span id="cb17-9"><a href="#cb17-9" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb17-10"><a href="#cb17-10" tabindex="-1"></a><span class="co">#> [,1] [,2] [,3] [,4]</span></span>
|
|||
|
<span id="cb17-11"><a href="#cb17-11" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb17-12"><a href="#cb17-12" tabindex="-1"></a><span class="co">#> [[2]]</span></span>
|
|||
|
<span id="cb17-13"><a href="#cb17-13" tabindex="-1"></a><span class="co">#> [,1] [,2] [,3] [,4] </span></span>
|
|||
|
<span id="cb17-14"><a href="#cb17-14" tabindex="-1"></a><span class="co">#> [1,] "219 733 8965" "219" "733" "8965"</span></span>
|
|||
|
<span id="cb17-15"><a href="#cb17-15" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb17-16"><a href="#cb17-16" tabindex="-1"></a><span class="co">#> [[3]]</span></span>
|
|||
|
<span id="cb17-17"><a href="#cb17-17" tabindex="-1"></a><span class="co">#> [,1] [,2] [,3] [,4] </span></span>
|
|||
|
<span id="cb17-18"><a href="#cb17-18" tabindex="-1"></a><span class="co">#> [1,] "329-293-8753" "329" "293" "8753"</span></span>
|
|||
|
<span id="cb17-19"><a href="#cb17-19" tabindex="-1"></a><span class="co">#> </span></span>
|
|||
|
<span id="cb17-20"><a href="#cb17-20" tabindex="-1"></a><span class="co">#> [[4]]</span></span>
|
|||
|
<span id="cb17-21"><a href="#cb17-21" tabindex="-1"></a><span class="co">#> [,1] [,2] [,3] [,4] </span></span>
|
|||
|
<span id="cb17-22"><a href="#cb17-22" tabindex="-1"></a><span class="co">#> [1,] "579-499-7527" "579" "499" "7527"</span></span>
|
|||
|
<span id="cb17-23"><a href="#cb17-23" tabindex="-1"></a><span class="co">#> [2,] "543.355.3679" "543" "355" "3679"</span></span></code></pre></div></li>
|
|||
|
<li><p><code>str_replace()</code> replaces the <strong>first</strong>
|
|||
|
matched pattern and returns a character vector.
|
|||
|
<code>str_replace_all()</code> replaces all matches. Similar to
|
|||
|
<code>sub()</code> and <code>gsub()</code>.</p>
|
|||
|
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" tabindex="-1"></a><span class="fu">str_replace</span>(strings, phone, <span class="st">"XXX-XXX-XXXX"</span>)</span>
|
|||
|
<span id="cb18-2"><a href="#cb18-2" tabindex="-1"></a><span class="co">#> [1] "apple" </span></span>
|
|||
|
<span id="cb18-3"><a href="#cb18-3" tabindex="-1"></a><span class="co">#> [2] "XXX-XXX-XXXX" </span></span>
|
|||
|
<span id="cb18-4"><a href="#cb18-4" tabindex="-1"></a><span class="co">#> [3] "XXX-XXX-XXXX" </span></span>
|
|||
|
<span id="cb18-5"><a href="#cb18-5" tabindex="-1"></a><span class="co">#> [4] "Work: XXX-XXX-XXXX; Home: 543.355.3679"</span></span>
|
|||
|
<span id="cb18-6"><a href="#cb18-6" tabindex="-1"></a><span class="fu">str_replace_all</span>(strings, phone, <span class="st">"XXX-XXX-XXXX"</span>)</span>
|
|||
|
<span id="cb18-7"><a href="#cb18-7" tabindex="-1"></a><span class="co">#> [1] "apple" </span></span>
|
|||
|
<span id="cb18-8"><a href="#cb18-8" tabindex="-1"></a><span class="co">#> [2] "XXX-XXX-XXXX" </span></span>
|
|||
|
<span id="cb18-9"><a href="#cb18-9" tabindex="-1"></a><span class="co">#> [3] "XXX-XXX-XXXX" </span></span>
|
|||
|
<span id="cb18-10"><a href="#cb18-10" tabindex="-1"></a><span class="co">#> [4] "Work: XXX-XXX-XXXX; Home: XXX-XXX-XXXX"</span></span></code></pre></div></li>
|
|||
|
<li><p><code>str_split_fixed()</code> splits a string into a
|
|||
|
<strong>fixed</strong> number of pieces based on a pattern and returns a
|
|||
|
character matrix. <code>str_split()</code> splits a string into a
|
|||
|
<strong>variable</strong> number of pieces and returns a list of
|
|||
|
character vectors.</p>
|
|||
|
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" tabindex="-1"></a><span class="fu">str_split</span>(<span class="st">"a-b-c"</span>, <span class="st">"-"</span>)</span>
|
|||
|
<span id="cb19-2"><a href="#cb19-2" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb19-3"><a href="#cb19-3" tabindex="-1"></a><span class="co">#> [1] "a" "b" "c"</span></span>
|
|||
|
<span id="cb19-4"><a href="#cb19-4" tabindex="-1"></a><span class="fu">str_split_fixed</span>(<span class="st">"a-b-c"</span>, <span class="st">"-"</span>, <span class="at">n =</span> <span class="dv">2</span>)</span>
|
|||
|
<span id="cb19-5"><a href="#cb19-5" tabindex="-1"></a><span class="co">#> [,1] [,2] </span></span>
|
|||
|
<span id="cb19-6"><a href="#cb19-6" tabindex="-1"></a><span class="co">#> [1,] "a" "b-c"</span></span></code></pre></div></li>
|
|||
|
</ul>
|
|||
|
</div>
|
|||
|
<div id="engines" class="section level3">
|
|||
|
<h3>Engines</h3>
|
|||
|
<p>There are four main engines that stringr can use to describe
|
|||
|
patterns:</p>
|
|||
|
<ul>
|
|||
|
<li><p>Regular expressions, the default, as shown above, and described
|
|||
|
in <code>vignette("regular-expressions")</code>.</p></li>
|
|||
|
<li><p>Fixed bytewise matching, with <code>fixed()</code>.</p></li>
|
|||
|
<li><p>Locale-sensitive character matching, with
|
|||
|
<code>coll()</code>.</p></li>
|
|||
|
<li><p>Text boundary analysis with <code>boundary()</code>.</p></li>
|
|||
|
</ul>
|
|||
|
<div id="fixed-matches" class="section level4">
|
|||
|
<h4>Fixed matches</h4>
|
|||
|
<p><code>fixed(x)</code> only matches the exact sequence of bytes
|
|||
|
specified by <code>x</code>. This is a very limited “pattern”, but the
|
|||
|
restriction can make matching much faster. Beware using
|
|||
|
<code>fixed()</code> with non-English data. It is problematic because
|
|||
|
there are often multiple ways of representing the same character. For
|
|||
|
example, there are two ways to define “á”: either as a single character
|
|||
|
or as an “a” plus an accent:</p>
|
|||
|
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a>a1 <span class="ot">&lt;-</span> <span class="st">"\u00e1"</span></span>
|
|||
|
<span id="cb20-2"><a href="#cb20-2" tabindex="-1"></a>a2 <span class="ot">&lt;-</span> <span class="st">"a\u0301"</span></span>
|
|||
|
<span id="cb20-3"><a href="#cb20-3" tabindex="-1"></a><span class="fu">c</span>(a1, a2)</span>
|
|||
|
<span id="cb20-4"><a href="#cb20-4" tabindex="-1"></a><span class="co">#> [1] "á" "á"</span></span>
|
|||
|
<span id="cb20-5"><a href="#cb20-5" tabindex="-1"></a>a1 <span class="sc">==</span> a2</span>
|
|||
|
<span id="cb20-6"><a href="#cb20-6" tabindex="-1"></a><span class="co">#> [1] FALSE</span></span></code></pre></div>
|
|||
|
<p>They render identically, but because they’re defined differently,
|
|||
|
<code>fixed()</code> doesn’t find a match. Instead, you can use
|
|||
|
<code>coll()</code>, explained below, to respect human character
|
|||
|
comparison rules:</p>
|
|||
|
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" tabindex="-1"></a><span class="fu">str_detect</span>(a1, <span class="fu">fixed</span>(a2))</span>
|
|||
|
<span id="cb21-2"><a href="#cb21-2" tabindex="-1"></a><span class="co">#> [1] FALSE</span></span>
|
|||
|
<span id="cb21-3"><a href="#cb21-3" tabindex="-1"></a><span class="fu">str_detect</span>(a1, <span class="fu">coll</span>(a2))</span>
|
|||
|
<span id="cb21-4"><a href="#cb21-4" tabindex="-1"></a><span class="co">#> [1] TRUE</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="collation-search" class="section level4">
|
|||
|
<h4>Collation search</h4>
|
|||
|
<p><code>coll(x)</code> looks for a match to <code>x</code> using
|
|||
|
human-language <strong>coll</strong>ation rules, and is particularly
|
|||
|
important if you want to do case-insensitive matching. Collation rules
|
|||
|
differ around the world, so you’ll also need to supply a
|
|||
|
<code>locale</code> parameter.</p>
|
|||
|
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" tabindex="-1"></a>i <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">"I"</span>, <span class="st">"İ"</span>, <span class="st">"i"</span>, <span class="st">"ı"</span>)</span>
|
|||
|
<span id="cb22-2"><a href="#cb22-2" tabindex="-1"></a>i</span>
|
|||
|
<span id="cb22-3"><a href="#cb22-3" tabindex="-1"></a><span class="co">#> [1] "I" "İ" "i" "ı"</span></span>
|
|||
|
<span id="cb22-4"><a href="#cb22-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb22-5"><a href="#cb22-5" tabindex="-1"></a><span class="fu">str_subset</span>(i, <span class="fu">coll</span>(<span class="st">"i"</span>, <span class="at">ignore_case =</span> <span class="cn">TRUE</span>))</span>
|
|||
|
<span id="cb22-6"><a href="#cb22-6" tabindex="-1"></a><span class="co">#> [1] "I" "i"</span></span>
|
|||
|
<span id="cb22-7"><a href="#cb22-7" tabindex="-1"></a><span class="fu">str_subset</span>(i, <span class="fu">coll</span>(<span class="st">"i"</span>, <span class="at">ignore_case =</span> <span class="cn">TRUE</span>, <span class="at">locale =</span> <span class="st">"tr"</span>))</span>
|
|||
|
<span id="cb22-8"><a href="#cb22-8" tabindex="-1"></a><span class="co">#> [1] "İ" "i"</span></span></code></pre></div>
|
|||
|
<p>The downside of <code>coll()</code> is speed. Because the rules for
|
|||
|
recognising which characters are the same are complicated,
|
|||
|
<code>coll()</code> is relatively slow compared to <code>regex()</code>
|
|||
|
and <code>fixed()</code>. Note that while both <code>fixed()</code> and
|
|||
|
<code>regex()</code> have <code>ignore_case</code> arguments, they
|
|||
|
perform a much simpler comparison than <code>coll()</code>.</p>
|
|||
|
</div>
|
|||
|
<div id="boundary" class="section level4">
|
|||
|
<h4>Boundary</h4>
|
|||
|
<p><code>boundary()</code> matches boundaries between characters, lines,
|
|||
|
sentences or words. It’s most useful with <code>str_split()</code>, but
|
|||
|
can be used with all pattern matching functions:</p>
|
|||
|
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="st">"This is a sentence."</span></span>
|
|||
|
<span id="cb23-2"><a href="#cb23-2" tabindex="-1"></a><span class="fu">str_split</span>(x, <span class="fu">boundary</span>(<span class="st">"word"</span>))</span>
|
|||
|
<span id="cb23-3"><a href="#cb23-3" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb23-4"><a href="#cb23-4" tabindex="-1"></a><span class="co">#> [1] "This" "is" "a" "sentence"</span></span>
|
|||
|
<span id="cb23-5"><a href="#cb23-5" tabindex="-1"></a><span class="fu">str_count</span>(x, <span class="fu">boundary</span>(<span class="st">"word"</span>))</span>
|
|||
|
<span id="cb23-6"><a href="#cb23-6" tabindex="-1"></a><span class="co">#> [1] 4</span></span>
|
|||
|
<span id="cb23-7"><a href="#cb23-7" tabindex="-1"></a><span class="fu">str_extract_all</span>(x, <span class="fu">boundary</span>(<span class="st">"word"</span>))</span>
|
|||
|
<span id="cb23-8"><a href="#cb23-8" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb23-9"><a href="#cb23-9" tabindex="-1"></a><span class="co">#> [1] "This" "is" "a" "sentence"</span></span></code></pre></div>
|
|||
|
<p>By convention, <code>""</code> is treated as
|
|||
|
<code>boundary("character")</code>:</p>
|
|||
|
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" tabindex="-1"></a><span class="fu">str_split</span>(x, <span class="st">""</span>)</span>
|
|||
|
<span id="cb24-2"><a href="#cb24-2" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
|
|||
|
<span id="cb24-3"><a href="#cb24-3" tabindex="-1"></a><span class="co">#> [1] "T" "h" "i" "s" " " "i" "s" " " "a" " " "s" "e" "n" "t" "e" "n" "c" "e" "."</span></span>
|
|||
|
<span id="cb24-4"><a href="#cb24-4" tabindex="-1"></a><span class="fu">str_count</span>(x, <span class="st">""</span>)</span>
|
|||
|
<span id="cb24-5"><a href="#cb24-5" tabindex="-1"></a><span class="co">#> [1] 19</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<!-- code folding -->
|
|||
|
|
|||
|
|
|||
|
<!-- dynamically load mathjax for compatibility with self-contained -->
|
|||
|
<script>
|
|||
|
// Build a <script> element for the MathJax bundle at runtime and attach it
// to the document's <head>. Wrapped in an immediately-invoked function
// expression so the local variables do not become globals.
(function loadMathJax() {
  var headEl = document.getElementsByTagName("head")[0];
  var mathjaxTag = document.createElement("script");

  mathjaxTag.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
  mathjaxTag.type = "text/javascript";

  // Insert the configured element as the last child of <head>.
  headEl.appendChild(mathjaxTag);
})();
|
|||
|
</script>
|
|||
|
|
|||
|
</body>
|
|||
|
</html>
|