<!-- Repository viewer residue (not part of the document): 719 lines, 44 KiB, HTML; Raw / Permalink / Normal View / History; 2025-01-12 00:52:51 +08:00 -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<!-- The lang attribute on the root element declares the document language (English)
     so assistive technology and browsers can choose pronunciation and hyphenation. -->
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Introduction to stringr</title>
<script>
// Pandoc 2.9 puts section attributes on both the wrapping div and its header.
// Strip them from the header so the markup behaves like Pandoc < 2.8 output.
document.addEventListener('DOMContentLoaded', function(e) {
  var firstChildren = document.querySelectorAll("div.section[class*='level'] > :first-child");
  var headingTag = /^h[1-6]$/i;
  for (var idx = 0; idx < firstChildren.length; idx++) {
    var node = firstChildren[idx];
    // Only h1-h6 elements are touched; any other first child is left alone.
    if (headingTag.test(node.tagName)) {
      var attrs = node.attributes;
      // Remove whatever sits at index 0 until the attribute list is empty.
      while (attrs.length > 0) node.removeAttribute(attrs[0].name);
    }
  }
});
</script>
<style type="text/css">
/* Baseline rules for inline code and generic helper classes. */
code { white-space: pre-wrap; }
span.smallcaps { font-variant: small-caps; }
span.underline { text-decoration: underline; }
div.column {
  display: inline-block;
  vertical-align: top;
  width: 50%;
}
div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}
ul.task-list { list-style: none; }
</style>
<style type="text/css">
/* Keep whitespace inside code verbatim and let source blocks overflow visibly. */
code { white-space: pre; }
.sourceCode { overflow: visible; }
</style>
<style type="text/css" data-origin="pandoc">
/* Layout of highlighted code blocks. The script that follows this sheet looks it
   up through data-origin="pandoc" and scans its div.sourceCode rules, so rule
   order and selectors are kept exactly as generated. */
pre > code.sourceCode {
  white-space: pre;
  position: relative;
}
pre > code.sourceCode > span { line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span {
  color: inherit;
  text-decoration: inherit;
}
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
  div.sourceCode { overflow: auto; }
}
@media print {
  pre > code.sourceCode { white-space: pre-wrap; }
  pre > code.sourceCode > span {
    text-indent: -5em;
    padding-left: 5em;
  }
}

/* Line numbers for pre.numberSource, drawn from a CSS counter. */
pre.numberSource code { counter-reset: source-line 0; }
pre.numberSource code > span {
  position: relative;
  left: -4em;
  counter-increment: source-line;
}
pre.numberSource code > span > a:first-child::before {
  content: counter(source-line);
  position: relative;
  left: -1em;
  text-align: right;
  vertical-align: baseline;
  border: none;
  display: inline-block;
  -webkit-touch-callout: none;
  -webkit-user-select: none;
  -khtml-user-select: none;
  -moz-user-select: none;
  -ms-user-select: none;
  user-select: none;
  padding: 0 4px;
  width: 4em;
  color: #aaaaaa;
}
pre.numberSource {
  margin-left: 3em;
  border-left: 1px solid #aaaaaa;
  padding-left: 4px;
}
/* Empty rule retained as generated. */
div.sourceCode { }
@media screen {
  pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}

/* Per-token colours. In this document's examples: co = comments and printed
   output, st = strings, fu = function names, dv = numbers, ot = the assignment
   arrow, at = named arguments, cn = constants such as TRUE. */
code span.al { color: #ff0000; font-weight: bold; }
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.at { color: #7d9029; }
code span.bn { color: #40a070; }
code span.bu { color: #008000; }
code span.cf { color: #007020; font-weight: bold; }
code span.ch { color: #4070a0; }
code span.cn { color: #880000; }
code span.co { color: #60a0b0; font-style: italic; }
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.do { color: #ba2121; font-style: italic; }
code span.dt { color: #902000; }
code span.dv { color: #40a070; }
code span.er { color: #ff0000; font-weight: bold; }
code span.ex { }
code span.fl { color: #40a070; }
code span.fu { color: #06287e; }
code span.im { color: #008000; font-weight: bold; }
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.kw { color: #007020; font-weight: bold; }
code span.op { color: #666666; }
code span.ot { color: #007020; }
code span.pp { color: #bc7a00; }
code span.sc { color: #4070a0; }
code span.ss { color: #bb6688; }
code span.st { color: #4070a0; }
code span.va { color: #19177c; }
code span.vs { color: #4070a0; }
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
</style>
<script>
// Apply pandoc's div.sourceCode colours to pre.sourceCode instead.
//
// Walks every stylesheet whose owning element carries data-origin="pandoc" and
// replaces each top-level `div.sourceCode` style rule that sets a text or
// background colour with an identical `pre.sourceCode` rule at the same index.
(function() {
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    // ownerNode is null for sheets that are not owned by an element, and only
    // elements expose dataset; skip those instead of throwing a TypeError that
    // would abort the whole loop.
    var owner = sheets[i].ownerNode;
    if (!owner || !owner.dataset || owner.dataset["origin"] !== "pandoc") continue;
    // Reading cssRules can throw (e.g. for cross-origin sheets); skip such sheets.
    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
    var j = 0;
    while (j < rules.length) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
        j++;
        continue;
      }
      // check if color or background-color is set
      if (rule.style.color === '' && rule.style.backgroundColor === '') {
        j++;
        continue;
      }
      var style = rule.style.cssText;
      // replace div.sourceCode by a pre.sourceCode rule; j is not advanced here
      // because the inserted rule no longer matches and is stepped over by the
      // selector check on the next iteration
      sheets[i].deleteRule(j);
      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
</script>
<style type="text/css">
/* Page frame: centred column, 700px at most, with side padding. */
body {
  background-color: #fff;
  margin: 1em auto;
  max-width: 700px;
  overflow: visible;
  padding-left: 2em;
  padding-right: 2em;
  font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
  font-size: 14px;
  line-height: 1.35;
}

/* Table of contents box. */
#TOC {
  clear: both;
  margin: 0 0 10px 10px;
  padding: 4px;
  width: 400px;
  border: 1px solid #CCCCCC;
  border-radius: 5px;
  background-color: #f6f6f6;
  font-size: 13px;
  line-height: 1.3;
}
#TOC .toctitle {
  font-weight: bold;
  font-size: 15px;
  margin-left: 5px;
}
#TOC ul {
  padding-left: 40px;
  margin-left: -1.5em;
  margin-top: 5px;
  margin-bottom: 5px;
}
#TOC ul ul { margin-left: -2em; }
#TOC li { line-height: 16px; }
/* Tables: centred, collapsed borders, horizontal rules only. */
table {
  margin: 1em auto;
  border-width: 1px;
  border-color: #DDDDDD;
  border-style: outset;
  border-collapse: collapse;
}
table th {
  border-width: 2px;
  padding: 5px;
  border-style: inset;
}
table td {
  border-width: 1px;
  border-style: inset;
  line-height: 18px;
  padding: 5px 5px;
}
/* Drop the vertical borders on the table and on every cell. */
table, table th, table td {
  border-left-style: none;
  border-right-style: none;
}
/* Shade the header and the rows marked "even". */
table thead, table tr.even { background-color: #f7f7f7; }
/* Paragraph spacing and shaded block quotes. */
p { margin: 0.5em 0; }
blockquote {
  background-color: #f6f6f6;
  padding: 0.25em 0.75em;
}
/* Horizontal rule: a single 1px line along the top edge. `border: none` resets
   the style of all four sides, so a separate border-style declaration placed
   before it has no effect and is omitted. */
hr {
  border: none;
  border-top: 1px solid #777;
  margin: 28px 0;
}
/* Definition lists. */
dl { margin-left: 0; }
dl dd {
  margin-bottom: 13px;
  margin-left: 13px;
}
dl dt { font-weight: bold; }

/* Unordered lists use circle markers placed outside the text block. */
ul { margin-top: 0; }
ul li { list-style: circle outside; }
ul ul { margin-bottom: 0; }
/* Code, inline and block: light grey panel with rounded corners. */
pre, code {
  background-color: #f7f7f7;
  border-radius: 3px;
  color: #333;
  white-space: pre-wrap;
}
pre {
  border-radius: 3px;
  margin: 5px 0px 10px 0px;
  padding: 10px;
}
pre:not([class]) { background-color: #f7f7f7; }
code {
  font-family: Consolas, Monaco, 'Courier New', monospace;
  font-size: 85%;
}
p > code, li > code { padding: 2px 0px; }

/* Figures are centred. */
div.figure { text-align: center; }
/* Images: white matte with a thin rounded frame. The border is declared once;
   a duplicate declaration with #DDDDDD placed earlier in the rule was always
   overridden by this one and is omitted. */
img {
  background-color: #FFFFFF;
  padding: 2px;
  border-radius: 3px;
  border: 1px solid #CCCCCC;
  margin: 0 5px;
}
/* Headings: h2-h6 are underlined, with the rule getting thinner by level. */
h1 {
  margin-top: 0;
  font-size: 35px;
  line-height: 40px;
}
h2 {
  border-bottom: 4px solid #f7f7f7;
  padding-top: 10px;
  padding-bottom: 2px;
  font-size: 145%;
}
h3 {
  border-bottom: 2px solid #f7f7f7;
  padding-top: 10px;
  font-size: 120%;
}
h4 {
  border-bottom: 1px solid #f7f7f7;
  margin-left: 8px;
  font-size: 105%;
}
h5, h6 {
  border-bottom: 1px solid #ccc;
  font-size: 105%;
}
/* Links: no underline by default, with distinct hover and visited colours. */
a {
  color: #0033dd;
  text-decoration: none;
}
a:hover { color: #6666ff; }
a:visited { color: #800080; }
a:visited:hover { color: #BB00BB; }
/* Links whose href starts with http: or https: are underlined. */
a[href^="http:"],
a[href^="https:"] { text-decoration: underline; }
/* Theme colours for token spans that are direct children of <code>.
   NOTE(review): in the code blocks visible in this file the token spans sit
   inside a per-line <span>, so these child-combinator selectors would not match
   them; confirm before relying on these colours. */
code > span.kw { color: #555; font-weight: bold; }
code > span.dt { color: #902000; }
code > span.dv { color: #40a070; }
code > span.bn,
code > span.fl,
code > span.ch,
code > span.st { color: #d14; }
code > span.co { color: #888888; font-style: italic; }
code > span.ot { color: #007020; }
code > span.al { color: #ff0000; font-weight: bold; }
code > span.fu { color: #900; font-weight: bold; }
code > span.er { color: #a61717; background-color: #e3d2d2; }
</style>
</head>
<body>
<h1 class="title toc-ignore">Introduction to stringr</h1>
<p>There are four main families of functions in stringr:</p>
<ol style="list-style-type: decimal">
<li><p>Character manipulation: these functions allow you to manipulate
individual characters within the strings in character vectors.</p></li>
<li><p>Whitespace tools to add, remove, and manipulate
whitespace.</p></li>
<li><p>Locale sensitive operations whose operations will vary from
locale to locale.</p></li>
<li><p>Pattern matching functions. These recognise four engines of
pattern description. The most common is regular expressions, but there
are three other tools.</p></li>
</ol>
<div id="getting-and-setting-individual-characters" class="section level2">
<h2>Getting and setting individual characters</h2>
<p>You can get the length of the string with
<code>str_length()</code>:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a><span class="fu">str_length</span>(<span class="st">&quot;abc&quot;</span>)</span>
<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a><span class="co">#&gt; [1] 3</span></span></code></pre></div>
<p>This is now equivalent to the base R function <code>nchar()</code>.
Previously it was needed to work around issues with <code>nchar()</code>
such as the fact that it returned 2 for <code>nchar(NA)</code>. This has
been fixed as of R 3.3.0, so it is no longer so important.</p>
<p>You can access individual characters using <code>str_sub()</code>. It
takes three arguments: a character vector, a <code>start</code> position
and an <code>end</code> position. Each position can be either a
positive integer, which counts from the left, or a negative integer,
which counts from the right. The positions are inclusive, and if longer
than the string, will be silently truncated.</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;abcdef&quot;</span>, <span class="st">&quot;ghifjk&quot;</span>)</span>
<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a></span>
<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a><span class="co"># The 3rd letter</span></span>
<span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a><span class="fu">str_sub</span>(x, <span class="dv">3</span>, <span class="dv">3</span>)</span>
<span id="cb2-5"><a href="#cb2-5" tabindex="-1"></a><span class="co">#&gt; [1] &quot;c&quot; &quot;i&quot;</span></span>
<span id="cb2-6"><a href="#cb2-6" tabindex="-1"></a></span>
<span id="cb2-7"><a href="#cb2-7" tabindex="-1"></a><span class="co"># The 2nd to 2nd-to-last character</span></span>
<span id="cb2-8"><a href="#cb2-8" tabindex="-1"></a><span class="fu">str_sub</span>(x, <span class="dv">2</span>, <span class="sc">-</span><span class="dv">2</span>)</span>
<span id="cb2-9"><a href="#cb2-9" tabindex="-1"></a><span class="co">#&gt; [1] &quot;bcde&quot; &quot;hifj&quot;</span></span></code></pre></div>
<p>You can also use <code>str_sub()</code> to modify strings:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a><span class="fu">str_sub</span>(x, <span class="dv">3</span>, <span class="dv">3</span>) <span class="ot">&lt;-</span> <span class="st">&quot;X&quot;</span></span>
<span id="cb3-2"><a href="#cb3-2" tabindex="-1"></a>x</span>
<span id="cb3-3"><a href="#cb3-3" tabindex="-1"></a><span class="co">#&gt; [1] &quot;abXdef&quot; &quot;ghXfjk&quot;</span></span></code></pre></div>
<p>To duplicate individual strings, you can use
<code>str_dup()</code>:</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a><span class="fu">str_dup</span>(x, <span class="fu">c</span>(<span class="dv">2</span>, <span class="dv">3</span>))</span>
<span id="cb4-2"><a href="#cb4-2" tabindex="-1"></a><span class="co">#&gt; [1] &quot;abXdefabXdef&quot; &quot;ghXfjkghXfjkghXfjk&quot;</span></span></code></pre></div>
</div>
<div id="whitespace" class="section level2">
<h2>Whitespace</h2>
<p>Three functions add, remove, or modify whitespace:</p>
<ol style="list-style-type: decimal">
<li><p><code>str_pad()</code> pads a string to a fixed length by adding
extra whitespace on the left, right, or both sides.</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;abc&quot;</span>, <span class="st">&quot;defghi&quot;</span>)</span>
<span id="cb5-2"><a href="#cb5-2" tabindex="-1"></a><span class="fu">str_pad</span>(x, <span class="dv">10</span>) <span class="co"># default pads on left</span></span>
<span id="cb5-3"><a href="#cb5-3" tabindex="-1"></a><span class="co">#&gt; [1] &quot;       abc&quot; &quot;    defghi&quot;</span></span>
<span id="cb5-4"><a href="#cb5-4" tabindex="-1"></a><span class="fu">str_pad</span>(x, <span class="dv">10</span>, <span class="st">&quot;both&quot;</span>)</span>
<span id="cb5-5"><a href="#cb5-5" tabindex="-1"></a><span class="co">#&gt; [1] &quot;   abc    &quot; &quot;  defghi  &quot;</span></span></code></pre></div>
<p>(You can pad with other characters by using the <code>pad</code>
argument.)</p>
<p><code>str_pad()</code> will never make a string shorter:</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a><span class="fu">str_pad</span>(x, <span class="dv">4</span>)</span>
<span id="cb6-2"><a href="#cb6-2" tabindex="-1"></a><span class="co">#&gt; [1] &quot; abc&quot; &quot;defghi&quot;</span></span></code></pre></div>
<p>So if you want to ensure that all strings are the same length (often
useful for print methods), combine <code>str_pad()</code> and
<code>str_trunc()</code>:</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;Short&quot;</span>, <span class="st">&quot;This is a long string&quot;</span>)</span>
<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a></span>
<span id="cb7-3"><a href="#cb7-3" tabindex="-1"></a>x <span class="sc">%&gt;%</span> </span>
<span id="cb7-4"><a href="#cb7-4" tabindex="-1"></a> <span class="fu">str_trunc</span>(<span class="dv">10</span>) <span class="sc">%&gt;%</span> </span>
<span id="cb7-5"><a href="#cb7-5" tabindex="-1"></a> <span class="fu">str_pad</span>(<span class="dv">10</span>, <span class="st">&quot;right&quot;</span>)</span>
<span id="cb7-6"><a href="#cb7-6" tabindex="-1"></a><span class="co">#&gt; [1] &quot;Short     &quot; &quot;This is...&quot;</span></span></code></pre></div></li>
<li><p>The opposite of <code>str_pad()</code> is
<code>str_trim()</code>, which removes leading and trailing
whitespace:</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot; a &quot;</span>, <span class="st">&quot;b &quot;</span>, <span class="st">&quot; c&quot;</span>)</span>
<span id="cb8-2"><a href="#cb8-2" tabindex="-1"></a><span class="fu">str_trim</span>(x)</span>
<span id="cb8-3"><a href="#cb8-3" tabindex="-1"></a><span class="co">#&gt; [1] &quot;a&quot; &quot;b&quot; &quot;c&quot;</span></span>
<span id="cb8-4"><a href="#cb8-4" tabindex="-1"></a><span class="fu">str_trim</span>(x, <span class="st">&quot;left&quot;</span>)</span>
<span id="cb8-5"><a href="#cb8-5" tabindex="-1"></a><span class="co">#&gt; [1] &quot;a &quot; &quot;b &quot; &quot;c&quot;</span></span></code></pre></div></li>
<li><p>You can use <code>str_wrap()</code> to modify existing whitespace
in order to wrap a paragraph of text, such that the length of each line
is as similar as possible.</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a>jabberwocky <span class="ot">&lt;-</span> <span class="fu">str_c</span>(</span>
<span id="cb9-2"><a href="#cb9-2" tabindex="-1"></a> <span class="st">&quot;`Twas brillig, and the slithy toves &quot;</span>,</span>
<span id="cb9-3"><a href="#cb9-3" tabindex="-1"></a> <span class="st">&quot;did gyre and gimble in the wabe: &quot;</span>,</span>
<span id="cb9-4"><a href="#cb9-4" tabindex="-1"></a> <span class="st">&quot;All mimsy were the borogoves, &quot;</span>,</span>
<span id="cb9-5"><a href="#cb9-5" tabindex="-1"></a> <span class="st">&quot;and the mome raths outgrabe. &quot;</span></span>
<span id="cb9-6"><a href="#cb9-6" tabindex="-1"></a>)</span>
<span id="cb9-7"><a href="#cb9-7" tabindex="-1"></a><span class="fu">cat</span>(<span class="fu">str_wrap</span>(jabberwocky, <span class="at">width =</span> <span class="dv">40</span>))</span>
<span id="cb9-8"><a href="#cb9-8" tabindex="-1"></a><span class="co">#&gt; `Twas brillig, and the slithy toves did</span></span>
<span id="cb9-9"><a href="#cb9-9" tabindex="-1"></a><span class="co">#&gt; gyre and gimble in the wabe: All mimsy</span></span>
<span id="cb9-10"><a href="#cb9-10" tabindex="-1"></a><span class="co">#&gt; were the borogoves, and the mome raths</span></span>
<span id="cb9-11"><a href="#cb9-11" tabindex="-1"></a><span class="co">#&gt; outgrabe.</span></span></code></pre></div></li>
</ol>
</div>
<div id="locale-sensitive" class="section level2">
<h2>Locale sensitive</h2>
<p>A handful of stringr functions are locale-sensitive: they will
perform differently in different regions of the world. These functions
are case transformation functions:</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="st">&quot;I like horses.&quot;</span></span>
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a><span class="fu">str_to_upper</span>(x)</span>
<span id="cb10-3"><a href="#cb10-3" tabindex="-1"></a><span class="co">#&gt; [1] &quot;I LIKE HORSES.&quot;</span></span>
<span id="cb10-4"><a href="#cb10-4" tabindex="-1"></a><span class="fu">str_to_title</span>(x)</span>
<span id="cb10-5"><a href="#cb10-5" tabindex="-1"></a><span class="co">#&gt; [1] &quot;I Like Horses.&quot;</span></span>
<span id="cb10-6"><a href="#cb10-6" tabindex="-1"></a></span>
<span id="cb10-7"><a href="#cb10-7" tabindex="-1"></a><span class="fu">str_to_lower</span>(x)</span>
<span id="cb10-8"><a href="#cb10-8" tabindex="-1"></a><span class="co">#&gt; [1] &quot;i like horses.&quot;</span></span>
<span id="cb10-9"><a href="#cb10-9" tabindex="-1"></a><span class="co"># Turkish has two sorts of i: with and without the dot</span></span>
<span id="cb10-10"><a href="#cb10-10" tabindex="-1"></a><span class="fu">str_to_lower</span>(x, <span class="st">&quot;tr&quot;</span>)</span>
<span id="cb10-11"><a href="#cb10-11" tabindex="-1"></a><span class="co">#&gt; [1] &quot;ı like horses.&quot;</span></span></code></pre></div>
<p>String ordering and sorting:</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;y&quot;</span>, <span class="st">&quot;i&quot;</span>, <span class="st">&quot;k&quot;</span>)</span>
<span id="cb11-2"><a href="#cb11-2" tabindex="-1"></a><span class="fu">str_order</span>(x)</span>
<span id="cb11-3"><a href="#cb11-3" tabindex="-1"></a><span class="co">#&gt; [1] 2 3 1</span></span>
<span id="cb11-4"><a href="#cb11-4" tabindex="-1"></a></span>
<span id="cb11-5"><a href="#cb11-5" tabindex="-1"></a><span class="fu">str_sort</span>(x)</span>
<span id="cb11-6"><a href="#cb11-6" tabindex="-1"></a><span class="co">#&gt; [1] &quot;i&quot; &quot;k&quot; &quot;y&quot;</span></span>
<span id="cb11-7"><a href="#cb11-7" tabindex="-1"></a><span class="co"># In Lithuanian, y comes between i and k</span></span>
<span id="cb11-8"><a href="#cb11-8" tabindex="-1"></a><span class="fu">str_sort</span>(x, <span class="at">locale =</span> <span class="st">&quot;lt&quot;</span>)</span>
<span id="cb11-9"><a href="#cb11-9" tabindex="-1"></a><span class="co">#&gt; [1] &quot;i&quot; &quot;y&quot; &quot;k&quot;</span></span></code></pre></div>
<p>The locale always defaults to English to ensure that the default
behaviour is identical across systems. Locales always include a two
letter ISO-639-1 language code (like “en” for English or “zh” for
Chinese), and optionally an ISO-3166 country code (like “en_GB” vs
“en_US”). You can see a complete list of available locales by running
<code>stringi::stri_locale_list()</code>.</p>
</div>
<div id="pattern-matching" class="section level2">
<h2>Pattern matching</h2>
<p>The vast majority of stringr functions work with patterns. These are
parameterised by the task they perform and the types of patterns they
match.</p>
<div id="tasks" class="section level3">
<h3>Tasks</h3>
<p>Each pattern matching function has the same first two arguments, a
character vector of <code>string</code>s to process and a single
<code>pattern</code> to match. stringr provides pattern matching
functions to <strong>detect</strong>, <strong>locate</strong>,
<strong>extract</strong>, <strong>match</strong>,
<strong>replace</strong>, and <strong>split</strong> strings. I’ll
illustrate how they work with some strings and a regular expression
designed to match (US) phone numbers:</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" tabindex="-1"></a>strings <span class="ot">&lt;-</span> <span class="fu">c</span>(</span>
<span id="cb12-2"><a href="#cb12-2" tabindex="-1"></a> <span class="st">&quot;apple&quot;</span>, </span>
<span id="cb12-3"><a href="#cb12-3" tabindex="-1"></a> <span class="st">&quot;219 733 8965&quot;</span>, </span>
<span id="cb12-4"><a href="#cb12-4" tabindex="-1"></a> <span class="st">&quot;329-293-8753&quot;</span>, </span>
<span id="cb12-5"><a href="#cb12-5" tabindex="-1"></a> <span class="st">&quot;Work: 579-499-7527; Home: 543.355.3679&quot;</span></span>
<span id="cb12-6"><a href="#cb12-6" tabindex="-1"></a>)</span>
<span id="cb12-7"><a href="#cb12-7" tabindex="-1"></a>phone <span class="ot">&lt;-</span> <span class="st">&quot;([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})&quot;</span></span></code></pre></div>
<ul>
<li><p><code>str_detect()</code> detects the presence or absence of a
pattern and returns a logical vector (similar to <code>grepl()</code>).
<code>str_subset()</code> returns the elements of a character vector
that match a regular expression (similar to <code>grep()</code> with
<code>value = TRUE</code>).</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" tabindex="-1"></a><span class="co"># Which strings contain phone numbers?</span></span>
<span id="cb13-2"><a href="#cb13-2" tabindex="-1"></a><span class="fu">str_detect</span>(strings, phone)</span>
<span id="cb13-3"><a href="#cb13-3" tabindex="-1"></a><span class="co">#&gt; [1] FALSE TRUE TRUE TRUE</span></span>
<span id="cb13-4"><a href="#cb13-4" tabindex="-1"></a><span class="fu">str_subset</span>(strings, phone)</span>
<span id="cb13-5"><a href="#cb13-5" tabindex="-1"></a><span class="co">#&gt; [1] &quot;219 733 8965&quot; </span></span>
<span id="cb13-6"><a href="#cb13-6" tabindex="-1"></a><span class="co">#&gt; [2] &quot;329-293-8753&quot; </span></span>
<span id="cb13-7"><a href="#cb13-7" tabindex="-1"></a><span class="co">#&gt; [3] &quot;Work: 579-499-7527; Home: 543.355.3679&quot;</span></span></code></pre></div></li>
<li><p><code>str_count()</code> counts the number of matches:</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" tabindex="-1"></a><span class="co"># How many phone numbers in each string?</span></span>
<span id="cb14-2"><a href="#cb14-2" tabindex="-1"></a><span class="fu">str_count</span>(strings, phone)</span>
<span id="cb14-3"><a href="#cb14-3" tabindex="-1"></a><span class="co">#&gt; [1] 0 1 1 2</span></span></code></pre></div></li>
<li><p><code>str_locate()</code> locates the <strong>first</strong>
position of a pattern and returns a numeric matrix with columns start
and end. <code>str_locate_all()</code> locates all matches, returning a
list of numeric matrices. Similar to <code>regexpr()</code> and
<code>gregexpr()</code>.</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" tabindex="-1"></a><span class="co"># Where in the string is the phone number located?</span></span>
<span id="cb15-2"><a href="#cb15-2" tabindex="-1"></a>(loc <span class="ot">&lt;-</span> <span class="fu">str_locate</span>(strings, phone))</span>
<span id="cb15-3"><a href="#cb15-3" tabindex="-1"></a><span class="co">#&gt; start end</span></span>
<span id="cb15-4"><a href="#cb15-4" tabindex="-1"></a><span class="co">#&gt; [1,] NA NA</span></span>
<span id="cb15-5"><a href="#cb15-5" tabindex="-1"></a><span class="co">#&gt; [2,] 1 12</span></span>
<span id="cb15-6"><a href="#cb15-6" tabindex="-1"></a><span class="co">#&gt; [3,] 1 12</span></span>
<span id="cb15-7"><a href="#cb15-7" tabindex="-1"></a><span class="co">#&gt; [4,] 7 18</span></span>
<span id="cb15-8"><a href="#cb15-8" tabindex="-1"></a><span class="fu">str_locate_all</span>(strings, phone)</span>
<span id="cb15-9"><a href="#cb15-9" tabindex="-1"></a><span class="co">#&gt; [[1]]</span></span>
<span id="cb15-10"><a href="#cb15-10" tabindex="-1"></a><span class="co">#&gt; start end</span></span>
<span id="cb15-11"><a href="#cb15-11" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb15-12"><a href="#cb15-12" tabindex="-1"></a><span class="co">#&gt; [[2]]</span></span>
<span id="cb15-13"><a href="#cb15-13" tabindex="-1"></a><span class="co">#&gt; start end</span></span>
<span id="cb15-14"><a href="#cb15-14" tabindex="-1"></a><span class="co">#&gt; [1,] 1 12</span></span>
<span id="cb15-15"><a href="#cb15-15" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb15-16"><a href="#cb15-16" tabindex="-1"></a><span class="co">#&gt; [[3]]</span></span>
<span id="cb15-17"><a href="#cb15-17" tabindex="-1"></a><span class="co">#&gt; start end</span></span>
<span id="cb15-18"><a href="#cb15-18" tabindex="-1"></a><span class="co">#&gt; [1,] 1 12</span></span>
<span id="cb15-19"><a href="#cb15-19" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb15-20"><a href="#cb15-20" tabindex="-1"></a><span class="co">#&gt; [[4]]</span></span>
<span id="cb15-21"><a href="#cb15-21" tabindex="-1"></a><span class="co">#&gt; start end</span></span>
<span id="cb15-22"><a href="#cb15-22" tabindex="-1"></a><span class="co">#&gt; [1,] 7 18</span></span>
<span id="cb15-23"><a href="#cb15-23" tabindex="-1"></a><span class="co">#&gt; [2,] 27 38</span></span></code></pre></div></li>
<li><p><code>str_extract()</code> extracts text corresponding to the
<strong>first</strong> match, returning a character vector.
<code>str_extract_all()</code> extracts all matches and returns a list
of character vectors.</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a><span class="co"># What are the phone numbers?</span></span>
<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a><span class="fu">str_extract</span>(strings, phone)</span>
<span id="cb16-3"><a href="#cb16-3" tabindex="-1"></a><span class="co">#&gt; [1] NA &quot;219 733 8965&quot; &quot;329-293-8753&quot; &quot;579-499-7527&quot;</span></span>
<span id="cb16-4"><a href="#cb16-4" tabindex="-1"></a><span class="fu">str_extract_all</span>(strings, phone)</span>
<span id="cb16-5"><a href="#cb16-5" tabindex="-1"></a><span class="co">#&gt; [[1]]</span></span>
<span id="cb16-6"><a href="#cb16-6" tabindex="-1"></a><span class="co">#&gt; character(0)</span></span>
<span id="cb16-7"><a href="#cb16-7" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb16-8"><a href="#cb16-8" tabindex="-1"></a><span class="co">#&gt; [[2]]</span></span>
<span id="cb16-9"><a href="#cb16-9" tabindex="-1"></a><span class="co">#&gt; [1] &quot;219 733 8965&quot;</span></span>
<span id="cb16-10"><a href="#cb16-10" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb16-11"><a href="#cb16-11" tabindex="-1"></a><span class="co">#&gt; [[3]]</span></span>
<span id="cb16-12"><a href="#cb16-12" tabindex="-1"></a><span class="co">#&gt; [1] &quot;329-293-8753&quot;</span></span>
<span id="cb16-13"><a href="#cb16-13" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb16-14"><a href="#cb16-14" tabindex="-1"></a><span class="co">#&gt; [[4]]</span></span>
<span id="cb16-15"><a href="#cb16-15" tabindex="-1"></a><span class="co">#&gt; [1] &quot;579-499-7527&quot; &quot;543.355.3679&quot;</span></span>
<span id="cb16-16"><a href="#cb16-16" tabindex="-1"></a><span class="fu">str_extract_all</span>(strings, phone, <span class="at">simplify =</span> <span class="cn">TRUE</span>)</span>
<span id="cb16-17"><a href="#cb16-17" tabindex="-1"></a><span class="co">#&gt; [,1] [,2] </span></span>
<span id="cb16-18"><a href="#cb16-18" tabindex="-1"></a><span class="co">#&gt; [1,] &quot;&quot; &quot;&quot; </span></span>
<span id="cb16-19"><a href="#cb16-19" tabindex="-1"></a><span class="co">#&gt; [2,] &quot;219 733 8965&quot; &quot;&quot; </span></span>
<span id="cb16-20"><a href="#cb16-20" tabindex="-1"></a><span class="co">#&gt; [3,] &quot;329-293-8753&quot; &quot;&quot; </span></span>
<span id="cb16-21"><a href="#cb16-21" tabindex="-1"></a><span class="co">#&gt; [4,] &quot;579-499-7527&quot; &quot;543.355.3679&quot;</span></span></code></pre></div></li>
<li><p><code>str_match()</code> extracts capture groups formed by
<code>()</code> from the <strong>first</strong> match. It returns a
character matrix with one column for the complete match and one column
for each group. <code>str_match_all()</code> extracts capture groups
from all matches and returns a list of character matrices. Similar to
<code>regmatches()</code>.</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" tabindex="-1"></a><span class="co"># Pull out the three components of the match</span></span>
<span id="cb17-2"><a href="#cb17-2" tabindex="-1"></a><span class="fu">str_match</span>(strings, phone)</span>
<span id="cb17-3"><a href="#cb17-3" tabindex="-1"></a><span class="co">#&gt; [,1] [,2] [,3] [,4] </span></span>
<span id="cb17-4"><a href="#cb17-4" tabindex="-1"></a><span class="co">#&gt; [1,] NA NA NA NA </span></span>
<span id="cb17-5"><a href="#cb17-5" tabindex="-1"></a><span class="co">#&gt; [2,] &quot;219 733 8965&quot; &quot;219&quot; &quot;733&quot; &quot;8965&quot;</span></span>
<span id="cb17-6"><a href="#cb17-6" tabindex="-1"></a><span class="co">#&gt; [3,] &quot;329-293-8753&quot; &quot;329&quot; &quot;293&quot; &quot;8753&quot;</span></span>
<span id="cb17-7"><a href="#cb17-7" tabindex="-1"></a><span class="co">#&gt; [4,] &quot;579-499-7527&quot; &quot;579&quot; &quot;499&quot; &quot;7527&quot;</span></span>
<span id="cb17-8"><a href="#cb17-8" tabindex="-1"></a><span class="fu">str_match_all</span>(strings, phone)</span>
<span id="cb17-9"><a href="#cb17-9" tabindex="-1"></a><span class="co">#&gt; [[1]]</span></span>
<span id="cb17-10"><a href="#cb17-10" tabindex="-1"></a><span class="co">#&gt; [,1] [,2] [,3] [,4]</span></span>
<span id="cb17-11"><a href="#cb17-11" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb17-12"><a href="#cb17-12" tabindex="-1"></a><span class="co">#&gt; [[2]]</span></span>
<span id="cb17-13"><a href="#cb17-13" tabindex="-1"></a><span class="co">#&gt; [,1] [,2] [,3] [,4] </span></span>
<span id="cb17-14"><a href="#cb17-14" tabindex="-1"></a><span class="co">#&gt; [1,] &quot;219 733 8965&quot; &quot;219&quot; &quot;733&quot; &quot;8965&quot;</span></span>
<span id="cb17-15"><a href="#cb17-15" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb17-16"><a href="#cb17-16" tabindex="-1"></a><span class="co">#&gt; [[3]]</span></span>
<span id="cb17-17"><a href="#cb17-17" tabindex="-1"></a><span class="co">#&gt; [,1] [,2] [,3] [,4] </span></span>
<span id="cb17-18"><a href="#cb17-18" tabindex="-1"></a><span class="co">#&gt; [1,] &quot;329-293-8753&quot; &quot;329&quot; &quot;293&quot; &quot;8753&quot;</span></span>
<span id="cb17-19"><a href="#cb17-19" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb17-20"><a href="#cb17-20" tabindex="-1"></a><span class="co">#&gt; [[4]]</span></span>
<span id="cb17-21"><a href="#cb17-21" tabindex="-1"></a><span class="co">#&gt; [,1] [,2] [,3] [,4] </span></span>
<span id="cb17-22"><a href="#cb17-22" tabindex="-1"></a><span class="co">#&gt; [1,] &quot;579-499-7527&quot; &quot;579&quot; &quot;499&quot; &quot;7527&quot;</span></span>
<span id="cb17-23"><a href="#cb17-23" tabindex="-1"></a><span class="co">#&gt; [2,] &quot;543.355.3679&quot; &quot;543&quot; &quot;355&quot; &quot;3679&quot;</span></span></code></pre></div></li>
<li><p><code>str_replace()</code> replaces the <strong>first</strong>
matched pattern and returns a character vector.
<code>str_replace_all()</code> replaces all matches. Similar to
<code>sub()</code> and <code>gsub()</code>.</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" tabindex="-1"></a><span class="fu">str_replace</span>(strings, phone, <span class="st">&quot;XXX-XXX-XXXX&quot;</span>)</span>
<span id="cb18-2"><a href="#cb18-2" tabindex="-1"></a><span class="co">#&gt; [1] &quot;apple&quot; </span></span>
<span id="cb18-3"><a href="#cb18-3" tabindex="-1"></a><span class="co">#&gt; [2] &quot;XXX-XXX-XXXX&quot; </span></span>
<span id="cb18-4"><a href="#cb18-4" tabindex="-1"></a><span class="co">#&gt; [3] &quot;XXX-XXX-XXXX&quot; </span></span>
<span id="cb18-5"><a href="#cb18-5" tabindex="-1"></a><span class="co">#&gt; [4] &quot;Work: XXX-XXX-XXXX; Home: 543.355.3679&quot;</span></span>
<span id="cb18-6"><a href="#cb18-6" tabindex="-1"></a><span class="fu">str_replace_all</span>(strings, phone, <span class="st">&quot;XXX-XXX-XXXX&quot;</span>)</span>
<span id="cb18-7"><a href="#cb18-7" tabindex="-1"></a><span class="co">#&gt; [1] &quot;apple&quot; </span></span>
<span id="cb18-8"><a href="#cb18-8" tabindex="-1"></a><span class="co">#&gt; [2] &quot;XXX-XXX-XXXX&quot; </span></span>
<span id="cb18-9"><a href="#cb18-9" tabindex="-1"></a><span class="co">#&gt; [3] &quot;XXX-XXX-XXXX&quot; </span></span>
<span id="cb18-10"><a href="#cb18-10" tabindex="-1"></a><span class="co">#&gt; [4] &quot;Work: XXX-XXX-XXXX; Home: XXX-XXX-XXXX&quot;</span></span></code></pre></div></li>
<li><p><code>str_split_fixed()</code> splits a string into a
<strong>fixed</strong> number of pieces based on a pattern and returns a
character matrix. <code>str_split()</code> splits a string into a
<strong>variable</strong> number of pieces and returns a list of
character vectors.</p>
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" tabindex="-1"></a><span class="fu">str_split</span>(<span class="st">&quot;a-b-c&quot;</span>, <span class="st">&quot;-&quot;</span>)</span>
<span id="cb19-2"><a href="#cb19-2" tabindex="-1"></a><span class="co">#&gt; [[1]]</span></span>
<span id="cb19-3"><a href="#cb19-3" tabindex="-1"></a><span class="co">#&gt; [1] &quot;a&quot; &quot;b&quot; &quot;c&quot;</span></span>
<span id="cb19-4"><a href="#cb19-4" tabindex="-1"></a><span class="fu">str_split_fixed</span>(<span class="st">&quot;a-b-c&quot;</span>, <span class="st">&quot;-&quot;</span>, <span class="at">n =</span> <span class="dv">2</span>)</span>
<span id="cb19-5"><a href="#cb19-5" tabindex="-1"></a><span class="co">#&gt; [,1] [,2] </span></span>
<span id="cb19-6"><a href="#cb19-6" tabindex="-1"></a><span class="co">#&gt; [1,] &quot;a&quot; &quot;b-c&quot;</span></span></code></pre></div></li>
</ul>
</div>
<div id="engines" class="section level3">
<h3>Engines</h3>
<p>There are four main engines that stringr can use to describe
patterns:</p>
<ul>
<li><p>Regular expressions, the default, as shown above, and described
in <code>vignette(&quot;regular-expressions&quot;)</code>.</p></li>
<li><p>Fixed bytewise matching, with <code>fixed()</code>.</p></li>
<li><p>Locale-sensitive character matching, with
<code>coll()</code>.</p></li>
<li><p>Text boundary analysis with <code>boundary()</code>.</p></li>
</ul>
<div id="fixed-matches" class="section level4">
<h4>Fixed matches</h4>
<p><code>fixed(x)</code> only matches the exact sequence of bytes
specified by <code>x</code>. This is a very limited “pattern”, but the
restriction can make matching much faster. Beware using
<code>fixed()</code> with non-English data. It is problematic because
there are often multiple ways of representing the same character. For
example, there are two ways to define “á”: either as a single character
or as an “a” plus an accent:</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a>a1 <span class="ot">&lt;-</span> <span class="st">&quot;\u00e1&quot;</span></span>
<span id="cb20-2"><a href="#cb20-2" tabindex="-1"></a>a2 <span class="ot">&lt;-</span> <span class="st">&quot;a\u0301&quot;</span></span>
<span id="cb20-3"><a href="#cb20-3" tabindex="-1"></a><span class="fu">c</span>(a1, a2)</span>
<span id="cb20-4"><a href="#cb20-4" tabindex="-1"></a><span class="co">#&gt; [1] &quot;á&quot; &quot;á&quot;</span></span>
<span id="cb20-5"><a href="#cb20-5" tabindex="-1"></a>a1 <span class="sc">==</span> a2</span>
<span id="cb20-6"><a href="#cb20-6" tabindex="-1"></a><span class="co">#&gt; [1] FALSE</span></span></code></pre></div>
<p>They render identically, but because they’re defined differently,
<code>fixed()</code> doesn’t find a match. Instead, you can use
<code>coll()</code>, explained below, to respect human character
comparison rules:</p>
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" tabindex="-1"></a><span class="fu">str_detect</span>(a1, <span class="fu">fixed</span>(a2))</span>
<span id="cb21-2"><a href="#cb21-2" tabindex="-1"></a><span class="co">#&gt; [1] FALSE</span></span>
<span id="cb21-3"><a href="#cb21-3" tabindex="-1"></a><span class="fu">str_detect</span>(a1, <span class="fu">coll</span>(a2))</span>
<span id="cb21-4"><a href="#cb21-4" tabindex="-1"></a><span class="co">#&gt; [1] TRUE</span></span></code></pre></div>
</div>
<div id="collation-search" class="section level4">
<h4>Collation search</h4>
<p><code>coll(x)</code> looks for a match to <code>x</code> using
human-language <strong>coll</strong>ation rules, and is particularly
important if you want to do case insensitive matching. Collation rules
differ around the world, so you’ll also need to supply a
<code>locale</code> parameter.</p>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" tabindex="-1"></a>i <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;I&quot;</span>, <span class="st">&quot;İ&quot;</span>, <span class="st">&quot;i&quot;</span>, <span class="st">&quot;ı&quot;</span>)</span>
<span id="cb22-2"><a href="#cb22-2" tabindex="-1"></a>i</span>
<span id="cb22-3"><a href="#cb22-3" tabindex="-1"></a><span class="co">#&gt; [1] &quot;I&quot; &quot;İ&quot; &quot;i&quot; &quot;ı&quot;</span></span>
<span id="cb22-4"><a href="#cb22-4" tabindex="-1"></a></span>
<span id="cb22-5"><a href="#cb22-5" tabindex="-1"></a><span class="fu">str_subset</span>(i, <span class="fu">coll</span>(<span class="st">&quot;i&quot;</span>, <span class="at">ignore_case =</span> <span class="cn">TRUE</span>))</span>
<span id="cb22-6"><a href="#cb22-6" tabindex="-1"></a><span class="co">#&gt; [1] &quot;I&quot; &quot;i&quot;</span></span>
<span id="cb22-7"><a href="#cb22-7" tabindex="-1"></a><span class="fu">str_subset</span>(i, <span class="fu">coll</span>(<span class="st">&quot;i&quot;</span>, <span class="at">ignore_case =</span> <span class="cn">TRUE</span>, <span class="at">locale =</span> <span class="st">&quot;tr&quot;</span>))</span>
<span id="cb22-8"><a href="#cb22-8" tabindex="-1"></a><span class="co">#&gt; [1] &quot;İ&quot; &quot;i&quot;</span></span></code></pre></div>
<p>The downside of <code>coll()</code> is speed. Because the rules for
recognising which characters are the same are complicated,
<code>coll()</code> is relatively slow compared to <code>regex()</code>
and <code>fixed()</code>. Note that although both <code>fixed()</code> and
<code>regex()</code> have <code>ignore_case</code> arguments, they
perform a much simpler comparison than <code>coll()</code>.</p>
</div>
<div id="boundary" class="section level4">
<h4>Boundary</h4>
<p><code>boundary()</code> matches boundaries between characters, lines,
sentences or words. It’s most useful with <code>str_split()</code>, but
can be used with all pattern matching functions:</p>
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="st">&quot;This is a sentence.&quot;</span></span>
<span id="cb23-2"><a href="#cb23-2" tabindex="-1"></a><span class="fu">str_split</span>(x, <span class="fu">boundary</span>(<span class="st">&quot;word&quot;</span>))</span>
<span id="cb23-3"><a href="#cb23-3" tabindex="-1"></a><span class="co">#&gt; [[1]]</span></span>
<span id="cb23-4"><a href="#cb23-4" tabindex="-1"></a><span class="co">#&gt; [1] &quot;This&quot; &quot;is&quot; &quot;a&quot; &quot;sentence&quot;</span></span>
<span id="cb23-5"><a href="#cb23-5" tabindex="-1"></a><span class="fu">str_count</span>(x, <span class="fu">boundary</span>(<span class="st">&quot;word&quot;</span>))</span>
<span id="cb23-6"><a href="#cb23-6" tabindex="-1"></a><span class="co">#&gt; [1] 4</span></span>
<span id="cb23-7"><a href="#cb23-7" tabindex="-1"></a><span class="fu">str_extract_all</span>(x, <span class="fu">boundary</span>(<span class="st">&quot;word&quot;</span>))</span>
<span id="cb23-8"><a href="#cb23-8" tabindex="-1"></a><span class="co">#&gt; [[1]]</span></span>
<span id="cb23-9"><a href="#cb23-9" tabindex="-1"></a><span class="co">#&gt; [1] &quot;This&quot; &quot;is&quot; &quot;a&quot; &quot;sentence&quot;</span></span></code></pre></div>
<p>By convention, <code>&quot;&quot;</code> is treated as
<code>boundary(&quot;character&quot;)</code>:</p>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" tabindex="-1"></a><span class="fu">str_split</span>(x, <span class="st">&quot;&quot;</span>)</span>
<span id="cb24-2"><a href="#cb24-2" tabindex="-1"></a><span class="co">#&gt; [[1]]</span></span>
<span id="cb24-3"><a href="#cb24-3" tabindex="-1"></a><span class="co">#&gt; [1] &quot;T&quot; &quot;h&quot; &quot;i&quot; &quot;s&quot; &quot; &quot; &quot;i&quot; &quot;s&quot; &quot; &quot; &quot;a&quot; &quot; &quot; &quot;s&quot; &quot;e&quot; &quot;n&quot; &quot;t&quot; &quot;e&quot; &quot;n&quot; &quot;c&quot; &quot;e&quot; &quot;.&quot;</span></span>
<span id="cb24-4"><a href="#cb24-4" tabindex="-1"></a><span class="fu">str_count</span>(x, <span class="st">&quot;&quot;</span>)</span>
<span id="cb24-5"><a href="#cb24-5" tabindex="-1"></a><span class="co">#&gt; [1] 19</span></span></code></pre></div>
</div>
</div>
</div>
<!-- code folding -->
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
// Build a <script> element that points at the hosted MathJax loader and
// attach it to the document head, so MathJax is fetched at runtime instead
// of being embedded in this page.
(function () {
  var headElement = document.getElementsByTagName("head")[0];
  var mathjaxLoader = document.createElement("script");
  mathjaxLoader.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
  mathjaxLoader.type = "text/javascript";
  // Appending the element is what triggers the browser to request the script.
  headElement.appendChild(mathjaxLoader);
})();
</script>
</body>
</html>