791 lines
48 KiB
HTML
791 lines
48 KiB
HTML
|
<!DOCTYPE html>
|
|||
|
|
|||
|
<html>
|
|||
|
|
|||
|
<head>
|
|||
|
|
|||
|
<meta charset="utf-8" />
|
|||
|
<meta name="generator" content="pandoc" />
|
|||
|
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
|
|||
|
|
|||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<title>Regular expressions</title>
|
|||
|
|
|||
|
<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
|
|||
|
// be compatible with the behavior of Pandoc < 2.8).
|
|||
|
document.addEventListener('DOMContentLoaded', function(e) {
  // Candidate nodes: the first child of every pandoc section wrapper div.
  var firstChildren = document.querySelectorAll("div.section[class*='level'] > :first-child");
  // Only h1-h6 elements are touched; any other first child is left alone.
  var headingTag = /^h[1-6]$/i;

  Array.prototype.forEach.call(firstChildren, function(node) {
    if (!headingTag.test(node.tagName)) {
      return;
    }
    // `attributes` is a live collection: removing index 0 shifts the rest
    // down, so keep removing the first entry until none remain.
    var attrs = node.attributes;
    while (attrs.length > 0) {
      node.removeAttribute(attrs[0].name);
    }
  });
});
|
|||
|
</script>
|
|||
|
|
|||
|
<style type="text/css">
|
|||
|
code{white-space: pre-wrap;}
|
|||
|
span.smallcaps{font-variant: small-caps;}
|
|||
|
span.underline{text-decoration: underline;}
|
|||
|
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
|||
|
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
|||
|
ul.task-list{list-style: none;}
|
|||
|
</style>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<style type="text/css">
|
|||
|
code {
|
|||
|
white-space: pre;
|
|||
|
}
|
|||
|
.sourceCode {
|
|||
|
overflow: visible;
|
|||
|
}
|
|||
|
</style>
|
|||
|
<style type="text/css" data-origin="pandoc">
|
|||
|
pre > code.sourceCode { white-space: pre; position: relative; }
|
|||
|
pre > code.sourceCode > span { line-height: 1.25; }
|
|||
|
pre > code.sourceCode > span:empty { height: 1.2em; }
|
|||
|
.sourceCode { overflow: visible; }
|
|||
|
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
|||
|
div.sourceCode { margin: 1em 0; }
|
|||
|
pre.sourceCode { margin: 0; }
|
|||
|
@media screen {
|
|||
|
div.sourceCode { overflow: auto; }
|
|||
|
}
|
|||
|
@media print {
|
|||
|
pre > code.sourceCode { white-space: pre-wrap; }
|
|||
|
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
|
|||
|
}
|
|||
|
pre.numberSource code
|
|||
|
{ counter-reset: source-line 0; }
|
|||
|
pre.numberSource code > span
|
|||
|
{ position: relative; left: -4em; counter-increment: source-line; }
|
|||
|
pre.numberSource code > span > a:first-child::before
|
|||
|
{ content: counter(source-line);
|
|||
|
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
|||
|
border: none; display: inline-block;
|
|||
|
-webkit-touch-callout: none; -webkit-user-select: none;
|
|||
|
-khtml-user-select: none; -moz-user-select: none;
|
|||
|
-ms-user-select: none; user-select: none;
|
|||
|
padding: 0 4px; width: 4em;
|
|||
|
color: #aaaaaa;
|
|||
|
}
|
|||
|
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
|
|||
|
div.sourceCode
|
|||
|
{ }
|
|||
|
@media screen {
|
|||
|
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
|||
|
}
|
|||
|
code span.al { color: #ff0000; font-weight: bold; }
|
|||
|
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.at { color: #7d9029; }
|
|||
|
code span.bn { color: #40a070; }
|
|||
|
code span.bu { color: #008000; }
|
|||
|
code span.cf { color: #007020; font-weight: bold; }
|
|||
|
code span.ch { color: #4070a0; }
|
|||
|
code span.cn { color: #880000; }
|
|||
|
code span.co { color: #60a0b0; font-style: italic; }
|
|||
|
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.do { color: #ba2121; font-style: italic; }
|
|||
|
code span.dt { color: #902000; }
|
|||
|
code span.dv { color: #40a070; }
|
|||
|
code span.er { color: #ff0000; font-weight: bold; }
|
|||
|
code span.ex { }
|
|||
|
code span.fl { color: #40a070; }
|
|||
|
code span.fu { color: #06287e; }
|
|||
|
code span.im { color: #008000; font-weight: bold; }
|
|||
|
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
code span.kw { color: #007020; font-weight: bold; }
|
|||
|
code span.op { color: #666666; }
|
|||
|
code span.ot { color: #007020; }
|
|||
|
code span.pp { color: #bc7a00; }
|
|||
|
code span.sc { color: #4070a0; }
|
|||
|
code span.ss { color: #bb6688; }
|
|||
|
code span.st { color: #4070a0; }
|
|||
|
code span.va { color: #19177c; }
|
|||
|
code span.vs { color: #4070a0; }
|
|||
|
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
|
|||
|
</style>
|
|||
|
<script>
|
|||
|
// apply pandoc div.sourceCode style to pre.sourceCode instead
|
|||
|
(function() {
  // For every stylesheet whose owner element carries data-origin="pandoc",
  // replace a `div.sourceCode` rule that sets `color` or `background-color`
  // with a `pre.sourceCode` rule holding the same declarations, at the same
  // position in the sheet.
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    var sheet = sheets[i];
    var owner = sheet.ownerNode;
    // A sheet may have no owner node, or an owner without a `dataset`;
    // skip those instead of throwing and aborting the whole pass.
    if (!owner || !owner.dataset || owner.dataset["origin"] !== "pandoc") continue;

    var rules;
    // Reading cssRules can throw; such a sheet is simply skipped.
    try { rules = sheet.cssRules; } catch (e) { continue; }

    var j = 0;
    while (j < rules.length) {
      var rule = rules[j];
      // Only a plain style rule whose selector is exactly div.sourceCode qualifies.
      var isTarget = rule.type === rule.STYLE_RULE && rule.selectorText === "div.sourceCode";
      // ...and only when it actually sets color or background-color.
      if (!isTarget || (rule.style.color === '' && rule.style.backgroundColor === '')) {
        j++;
        continue;
      }
      var style = rule.style.cssText;
      // Swap the rule in place. `j` is deliberately not advanced here: the
      // next pass sees the new pre.sourceCode rule, which fails the selector
      // check above and is stepped over.
      sheet.deleteRule(j);
      sheet.insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
|
|||
|
</script>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<style type="text/css">body {
|
|||
|
background-color: #fff;
|
|||
|
margin: 1em auto;
|
|||
|
max-width: 700px;
|
|||
|
overflow: visible;
|
|||
|
padding-left: 2em;
|
|||
|
padding-right: 2em;
|
|||
|
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
|
|||
|
font-size: 14px;
|
|||
|
line-height: 1.35;
|
|||
|
}
|
|||
|
#TOC {
|
|||
|
clear: both;
|
|||
|
margin: 0 0 10px 10px;
|
|||
|
padding: 4px;
|
|||
|
width: 400px;
|
|||
|
border: 1px solid #CCCCCC;
|
|||
|
border-radius: 5px;
|
|||
|
background-color: #f6f6f6;
|
|||
|
font-size: 13px;
|
|||
|
line-height: 1.3;
|
|||
|
}
|
|||
|
#TOC .toctitle {
|
|||
|
font-weight: bold;
|
|||
|
font-size: 15px;
|
|||
|
margin-left: 5px;
|
|||
|
}
|
|||
|
#TOC ul {
|
|||
|
padding-left: 40px;
|
|||
|
margin-left: -1.5em;
|
|||
|
margin-top: 5px;
|
|||
|
margin-bottom: 5px;
|
|||
|
}
|
|||
|
#TOC ul ul {
|
|||
|
margin-left: -2em;
|
|||
|
}
|
|||
|
#TOC li {
|
|||
|
line-height: 16px;
|
|||
|
}
|
|||
|
table {
|
|||
|
margin: 1em auto;
|
|||
|
border-width: 1px;
|
|||
|
border-color: #DDDDDD;
|
|||
|
border-style: outset;
|
|||
|
border-collapse: collapse;
|
|||
|
}
|
|||
|
table th {
|
|||
|
border-width: 2px;
|
|||
|
padding: 5px;
|
|||
|
border-style: inset;
|
|||
|
}
|
|||
|
table td {
|
|||
|
border-width: 1px;
|
|||
|
border-style: inset;
|
|||
|
line-height: 18px;
|
|||
|
padding: 5px 5px;
|
|||
|
}
|
|||
|
table, table th, table td {
|
|||
|
border-left-style: none;
|
|||
|
border-right-style: none;
|
|||
|
}
|
|||
|
table thead, table tr.even {
|
|||
|
background-color: #f7f7f7;
|
|||
|
}
|
|||
|
p {
|
|||
|
margin: 0.5em 0;
|
|||
|
}
|
|||
|
blockquote {
|
|||
|
background-color: #f6f6f6;
|
|||
|
padding: 0.25em 0.75em;
|
|||
|
}
|
|||
|
hr {
|
|||
|
border-style: solid;
|
|||
|
border: none;
|
|||
|
border-top: 1px solid #777;
|
|||
|
margin: 28px 0;
|
|||
|
}
|
|||
|
dl {
|
|||
|
margin-left: 0;
|
|||
|
}
|
|||
|
dl dd {
|
|||
|
margin-bottom: 13px;
|
|||
|
margin-left: 13px;
|
|||
|
}
|
|||
|
dl dt {
|
|||
|
font-weight: bold;
|
|||
|
}
|
|||
|
ul {
|
|||
|
margin-top: 0;
|
|||
|
}
|
|||
|
ul li {
|
|||
|
list-style: circle outside;
|
|||
|
}
|
|||
|
ul ul {
|
|||
|
margin-bottom: 0;
|
|||
|
}
|
|||
|
pre, code {
|
|||
|
background-color: #f7f7f7;
|
|||
|
border-radius: 3px;
|
|||
|
color: #333;
|
|||
|
white-space: pre-wrap;
|
|||
|
}
|
|||
|
pre {
|
|||
|
border-radius: 3px;
|
|||
|
margin: 5px 0px 10px 0px;
|
|||
|
padding: 10px;
|
|||
|
}
|
|||
|
pre:not([class]) {
|
|||
|
background-color: #f7f7f7;
|
|||
|
}
|
|||
|
code {
|
|||
|
font-family: Consolas, Monaco, 'Courier New', monospace;
|
|||
|
font-size: 85%;
|
|||
|
}
|
|||
|
p > code, li > code {
|
|||
|
padding: 2px 0px;
|
|||
|
}
|
|||
|
div.figure {
|
|||
|
text-align: center;
|
|||
|
}
|
|||
|
img {
|
|||
|
background-color: #FFFFFF;
|
|||
|
padding: 2px;
|
|||
|
border: 1px solid #DDDDDD;
|
|||
|
border-radius: 3px;
|
|||
|
border: 1px solid #CCCCCC;
|
|||
|
margin: 0 5px;
|
|||
|
}
|
|||
|
h1 {
|
|||
|
margin-top: 0;
|
|||
|
font-size: 35px;
|
|||
|
line-height: 40px;
|
|||
|
}
|
|||
|
h2 {
|
|||
|
border-bottom: 4px solid #f7f7f7;
|
|||
|
padding-top: 10px;
|
|||
|
padding-bottom: 2px;
|
|||
|
font-size: 145%;
|
|||
|
}
|
|||
|
h3 {
|
|||
|
border-bottom: 2px solid #f7f7f7;
|
|||
|
padding-top: 10px;
|
|||
|
font-size: 120%;
|
|||
|
}
|
|||
|
h4 {
|
|||
|
border-bottom: 1px solid #f7f7f7;
|
|||
|
margin-left: 8px;
|
|||
|
font-size: 105%;
|
|||
|
}
|
|||
|
h5, h6 {
|
|||
|
border-bottom: 1px solid #ccc;
|
|||
|
font-size: 105%;
|
|||
|
}
|
|||
|
a {
|
|||
|
color: #0033dd;
|
|||
|
text-decoration: none;
|
|||
|
}
|
|||
|
a:hover {
|
|||
|
color: #6666ff; }
|
|||
|
a:visited {
|
|||
|
color: #800080; }
|
|||
|
a:visited:hover {
|
|||
|
color: #BB00BB; }
|
|||
|
a[href^="http:"] {
|
|||
|
text-decoration: underline; }
|
|||
|
a[href^="https:"] {
|
|||
|
text-decoration: underline; }
|
|||
|
|
|||
|
code > span.kw { color: #555; font-weight: bold; }
|
|||
|
code > span.dt { color: #902000; }
|
|||
|
code > span.dv { color: #40a070; }
|
|||
|
code > span.bn { color: #d14; }
|
|||
|
code > span.fl { color: #d14; }
|
|||
|
code > span.ch { color: #d14; }
|
|||
|
code > span.st { color: #d14; }
|
|||
|
code > span.co { color: #888888; font-style: italic; }
|
|||
|
code > span.ot { color: #007020; }
|
|||
|
code > span.al { color: #ff0000; font-weight: bold; }
|
|||
|
code > span.fu { color: #900; font-weight: bold; }
|
|||
|
code > span.er { color: #a61717; background-color: #e3d2d2; }
|
|||
|
</style>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
</head>
|
|||
|
|
|||
|
<body>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<h1 class="title toc-ignore">Regular expressions</h1>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<p>Regular expressions are a concise and flexible tool for describing
|
|||
|
patterns in strings. This vignette describes the key features of
|
|||
|
stringr’s regular expressions, as implemented by <a href="https://github.com/gagolews/stringi">stringi</a>. It is not a
|
|||
|
tutorial, so if you’re unfamiliar with regular expressions, I’d recommend
|
|||
|
starting at <a href="https://r4ds.had.co.nz/strings.html" class="uri">https://r4ds.had.co.nz/strings.html</a>. If you want to
|
|||
|
master the details, I’d recommend reading the classic <a href="https://www.amazon.com/Mastering-Regular-Expressions-Jeffrey-Friedl/dp/0596528124"><em>Mastering
|
|||
|
Regular Expressions</em></a> by Jeffrey E. F. Friedl.</p>
|
|||
|
<p>Regular expressions are the default pattern engine in stringr. That
|
|||
|
means when you use a pattern matching function with a bare string, it’s
|
|||
|
equivalent to wrapping it in a call to <code>regex()</code>:</p>
|
|||
|
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a><span class="co"># The regular call:</span></span>
|
|||
|
<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a><span class="fu">str_extract</span>(fruit, <span class="st">"nana"</span>)</span>
|
|||
|
<span id="cb1-3"><a href="#cb1-3" tabindex="-1"></a><span class="co"># Is shorthand for</span></span>
|
|||
|
<span id="cb1-4"><a href="#cb1-4" tabindex="-1"></a><span class="fu">str_extract</span>(fruit, <span class="fu">regex</span>(<span class="st">"nana"</span>))</span></code></pre></div>
|
|||
|
<p>You will need to use <code>regex()</code> explicitly if you want to
|
|||
|
override the default options, as you’ll see in examples below.</p>
|
|||
|
<div id="basic-matches" class="section level2">
|
|||
|
<h2>Basic matches</h2>
|
|||
|
<p>The simplest patterns match exact strings:</p>
|
|||
|
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a>x <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"apple"</span>, <span class="st">"banana"</span>, <span class="st">"pear"</span>)</span>
|
|||
|
<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"an"</span>)</span>
|
|||
|
<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a><span class="co">#> [1] NA "an" NA</span></span></code></pre></div>
|
|||
|
<p>You can perform a case-insensitive match using
|
|||
|
<code>ignore_case = TRUE</code>:</p>
|
|||
|
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a>bananas <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"banana"</span>, <span class="st">"Banana"</span>, <span class="st">"BANANA"</span>)</span>
|
|||
|
<span id="cb3-2"><a href="#cb3-2" tabindex="-1"></a><span class="fu">str_detect</span>(bananas, <span class="st">"banana"</span>)</span>
|
|||
|
<span id="cb3-3"><a href="#cb3-3" tabindex="-1"></a><span class="co">#> [1] TRUE FALSE FALSE</span></span>
|
|||
|
<span id="cb3-4"><a href="#cb3-4" tabindex="-1"></a><span class="fu">str_detect</span>(bananas, <span class="fu">regex</span>(<span class="st">"banana"</span>, <span class="at">ignore_case =</span> <span class="cn">TRUE</span>))</span>
|
|||
|
<span id="cb3-5"><a href="#cb3-5" tabindex="-1"></a><span class="co">#> [1] TRUE TRUE TRUE</span></span></code></pre></div>
|
|||
|
<p>The next step up in complexity is <code>.</code>, which matches any
|
|||
|
character except a newline:</p>
|
|||
|
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">".a."</span>)</span>
|
|||
|
<span id="cb4-2"><a href="#cb4-2" tabindex="-1"></a><span class="co">#> [1] NA "ban" "ear"</span></span></code></pre></div>
|
|||
|
<p>You can allow <code>.</code> to match everything, including
|
|||
|
<code>\n</code>, by setting <code>dotall = TRUE</code>:</p>
|
|||
|
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a><span class="fu">str_detect</span>(<span class="st">"</span><span class="sc">\n</span><span class="st">X</span><span class="sc">\n</span><span class="st">"</span>, <span class="st">".X."</span>)</span>
|
|||
|
<span id="cb5-2"><a href="#cb5-2" tabindex="-1"></a><span class="co">#> [1] FALSE</span></span>
|
|||
|
<span id="cb5-3"><a href="#cb5-3" tabindex="-1"></a><span class="fu">str_detect</span>(<span class="st">"</span><span class="sc">\n</span><span class="st">X</span><span class="sc">\n</span><span class="st">"</span>, <span class="fu">regex</span>(<span class="st">".X."</span>, <span class="at">dotall =</span> <span class="cn">TRUE</span>))</span>
|
|||
|
<span id="cb5-4"><a href="#cb5-4" tabindex="-1"></a><span class="co">#> [1] TRUE</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="escaping" class="section level2">
|
|||
|
<h2>Escaping</h2>
|
|||
|
<p>If “<code>.</code>” matches any character, how do you match a literal
|
|||
|
“<code>.</code>”? You need to use an “escape” to tell the regular
|
|||
|
expression you want to match it exactly, not use its special behaviour.
|
|||
|
Like strings, regexps use the backslash, <code>\</code>, to escape
|
|||
|
special behaviour. So to match an <code>.</code>, you need the regexp
|
|||
|
<code>\.</code>. Unfortunately this creates a problem. We use strings to
|
|||
|
represent regular expressions, and <code>\</code> is also used as an
|
|||
|
escape symbol in strings. So to create the regular expression
|
|||
|
<code>\.</code> we need the string <code>"\\."</code>.</p>
|
|||
|
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a><span class="co"># To create the regular expression, we need \\</span></span>
|
|||
|
<span id="cb6-2"><a href="#cb6-2" tabindex="-1"></a>dot <span class="ot"><-</span> <span class="st">"</span><span class="sc">\\</span><span class="st">."</span></span>
|
|||
|
<span id="cb6-3"><a href="#cb6-3" tabindex="-1"></a></span>
|
|||
|
<span id="cb6-4"><a href="#cb6-4" tabindex="-1"></a><span class="co"># But the expression itself only contains one:</span></span>
|
|||
|
<span id="cb6-5"><a href="#cb6-5" tabindex="-1"></a><span class="fu">writeLines</span>(dot)</span>
|
|||
|
<span id="cb6-6"><a href="#cb6-6" tabindex="-1"></a><span class="co">#> \.</span></span>
|
|||
|
<span id="cb6-7"><a href="#cb6-7" tabindex="-1"></a></span>
|
|||
|
<span id="cb6-8"><a href="#cb6-8" tabindex="-1"></a><span class="co"># And this tells R to look for an explicit .</span></span>
|
|||
|
<span id="cb6-9"><a href="#cb6-9" tabindex="-1"></a><span class="fu">str_extract</span>(<span class="fu">c</span>(<span class="st">"abc"</span>, <span class="st">"a.c"</span>, <span class="st">"bef"</span>), <span class="st">"a</span><span class="sc">\\</span><span class="st">.c"</span>)</span>
|
|||
|
<span id="cb6-10"><a href="#cb6-10" tabindex="-1"></a><span class="co">#> [1] NA "a.c" NA</span></span></code></pre></div>
|
|||
|
<p>If <code>\</code> is used as an escape character in regular
|
|||
|
expressions, how do you match a literal <code>\</code>? Well you need to
|
|||
|
escape it, creating the regular expression <code>\\</code>. To create
|
|||
|
that regular expression, you need to use a string, which also needs to
|
|||
|
escape <code>\</code>. That means to match a literal <code>\</code> you
|
|||
|
need to write <code>"\\\\"</code> — you need four backslashes to match
|
|||
|
one!</p>
|
|||
|
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a>x <span class="ot"><-</span> <span class="st">"a</span><span class="sc">\\</span><span class="st">b"</span></span>
|
|||
|
<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a><span class="fu">writeLines</span>(x)</span>
|
|||
|
<span id="cb7-3"><a href="#cb7-3" tabindex="-1"></a><span class="co">#> a\b</span></span>
|
|||
|
<span id="cb7-4"><a href="#cb7-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb7-5"><a href="#cb7-5" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"</span><span class="sc">\\\\</span><span class="st">"</span>)</span>
|
|||
|
<span id="cb7-6"><a href="#cb7-6" tabindex="-1"></a><span class="co">#> [1] "\\"</span></span></code></pre></div>
|
|||
|
<p>In this vignette, I use <code>\.</code> to denote the regular
|
|||
|
expression, and <code>"\\."</code> to denote the string that represents
|
|||
|
the regular expression.</p>
|
|||
|
<p>An alternative quoting mechanism is <code>\Q...\E</code>: all the
|
|||
|
characters in <code>...</code> are treated as exact matches. This is
|
|||
|
useful if you want to exactly match user input as part of a regular
|
|||
|
expression.</p>
|
|||
|
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" tabindex="-1"></a>x <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"a.b.c.d"</span>, <span class="st">"aeb"</span>)</span>
|
|||
|
<span id="cb8-2"><a href="#cb8-2" tabindex="-1"></a>starts_with <span class="ot"><-</span> <span class="st">"a.b"</span></span>
|
|||
|
<span id="cb8-3"><a href="#cb8-3" tabindex="-1"></a></span>
|
|||
|
<span id="cb8-4"><a href="#cb8-4" tabindex="-1"></a><span class="fu">str_detect</span>(x, <span class="fu">paste0</span>(<span class="st">"^"</span>, starts_with))</span>
|
|||
|
<span id="cb8-5"><a href="#cb8-5" tabindex="-1"></a><span class="co">#> [1] TRUE TRUE</span></span>
|
|||
|
<span id="cb8-6"><a href="#cb8-6" tabindex="-1"></a><span class="fu">str_detect</span>(x, <span class="fu">paste0</span>(<span class="st">"^</span><span class="sc">\\</span><span class="st">Q"</span>, starts_with, <span class="st">"</span><span class="sc">\\</span><span class="st">E"</span>))</span>
|
|||
|
<span id="cb8-7"><a href="#cb8-7" tabindex="-1"></a><span class="co">#> [1] TRUE FALSE</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="special-characters" class="section level2">
|
|||
|
<h2>Special characters</h2>
|
|||
|
<p>Escapes also allow you to specify individual characters that are
|
|||
|
otherwise hard to type. You can specify individual unicode characters in
|
|||
|
five ways, either as a variable number of hex digits (four is most
|
|||
|
common), or by name:</p>
|
|||
|
<ul>
|
|||
|
<li><p><code>\xhh</code>: 2 hex digits.</p></li>
|
|||
|
<li><p><code>\x{hhhh}</code>: 1-6 hex digits.</p></li>
|
|||
|
<li><p><code>\uhhhh</code>: 4 hex digits.</p></li>
|
|||
|
<li><p><code>\Uhhhhhhhh</code>: 8 hex digits.</p></li>
|
|||
|
<li><p><code>\N{name}</code>, e.g. <code>\N{grinning face}</code>
|
|||
|
matches the basic smiling emoji.</p></li>
|
|||
|
</ul>
|
|||
|
<p>Similarly, you can specify many common control characters:</p>
|
|||
|
<ul>
|
|||
|
<li><p><code>\a</code>: bell.</p></li>
|
|||
|
<li><p><code>\cX</code>: matches a control-X character.</p></li>
|
|||
|
<li><p><code>\e</code>: escape (<code>\u001B</code>).</p></li>
|
|||
|
<li><p><code>\f</code>: form feed (<code>\u000C</code>).</p></li>
|
|||
|
<li><p><code>\n</code>: line feed (<code>\u000A</code>).</p></li>
|
|||
|
<li><p><code>\r</code>: carriage return (<code>\u000D</code>).</p></li>
|
|||
|
<li><p><code>\t</code>: horizontal tabulation
|
|||
|
(<code>\u0009</code>).</p></li>
|
|||
|
<li><p><code>\0ooo</code> matches an octal character. ‘ooo’ is from one to
|
|||
|
three octal digits, from 000 to 0377. The leading zero is
|
|||
|
required.</p></li>
|
|||
|
</ul>
|
|||
|
<p>(Many of these are only of historical interest and are only included
|
|||
|
here for the sake of completeness.)</p>
|
|||
|
</div>
|
|||
|
<div id="matching-multiple-characters" class="section level2">
|
|||
|
<h2>Matching multiple characters</h2>
|
|||
|
<p>There are a number of patterns that match more than one character.
|
|||
|
You’ve already seen <code>.</code>, which matches any character (except
|
|||
|
a newline). A closely related operator is <code>\X</code>, which matches
|
|||
|
a <strong>grapheme cluster</strong>, a set of individual elements that
|
|||
|
form a single symbol. For example, one way of representing “á” is as the
|
|||
|
letter “a” plus an accent: <code>.</code> will match the component “a”,
|
|||
|
while <code>\X</code> will match the complete symbol:</p>
|
|||
|
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a>x <span class="ot"><-</span> <span class="st">"a\u0301"</span></span>
|
|||
|
<span id="cb9-2"><a href="#cb9-2" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"."</span>)</span>
|
|||
|
<span id="cb9-3"><a href="#cb9-3" tabindex="-1"></a><span class="co">#> [1] "a"</span></span>
|
|||
|
<span id="cb9-4"><a href="#cb9-4" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"</span><span class="sc">\\</span><span class="st">X"</span>)</span>
|
|||
|
<span id="cb9-5"><a href="#cb9-5" tabindex="-1"></a><span class="co">#> [1] "á"</span></span></code></pre></div>
|
|||
|
<p>There are five other escaped pairs that match narrower classes of
|
|||
|
characters:</p>
|
|||
|
<ul>
|
|||
|
<li><p><code>\d</code>: matches any digit. The complement,
|
|||
|
<code>\D</code>, matches any character that is not a decimal digit.</p>
|
|||
|
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a><span class="fu">str_extract_all</span>(<span class="st">"1 + 2 = 3"</span>, <span class="st">"</span><span class="sc">\\</span><span class="st">d+"</span>)[[<span class="dv">1</span>]]</span>
|
|||
|
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a><span class="co">#> [1] "1" "2" "3"</span></span></code></pre></div>
|
|||
|
<p>Technically, <code>\d</code> includes any character in the Unicode
|
|||
|
Category of Nd (“Number, Decimal Digit”), which also includes numeric
|
|||
|
symbols from other languages:</p>
|
|||
|
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a><span class="co"># Some Khmer numbers</span></span>
|
|||
|
<span id="cb11-2"><a href="#cb11-2" tabindex="-1"></a><span class="fu">str_detect</span>(<span class="st">"១២៣"</span>, <span class="st">"</span><span class="sc">\\</span><span class="st">d"</span>)</span>
|
|||
|
<span id="cb11-3"><a href="#cb11-3" tabindex="-1"></a><span class="co">#> [1] TRUE</span></span></code></pre></div></li>
|
|||
|
<li><p><code>\s</code>: matches any whitespace. This includes tabs,
|
|||
|
newlines, form feeds, and any character in the Unicode Z Category (which
|
|||
|
includes a variety of space characters and other separators). The
|
|||
|
complement, <code>\S</code>, matches any non-whitespace character.</p>
|
|||
|
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" tabindex="-1"></a>(text <span class="ot"><-</span> <span class="st">"Some </span><span class="sc">\t</span><span class="st"> badly</span><span class="sc">\n\t\t</span><span class="st">spaced </span><span class="sc">\f</span><span class="st"> text"</span>)</span>
|
|||
|
<span id="cb12-2"><a href="#cb12-2" tabindex="-1"></a><span class="co">#> [1] "Some \t badly\n\t\tspaced \f text"</span></span>
|
|||
|
<span id="cb12-3"><a href="#cb12-3" tabindex="-1"></a><span class="fu">str_replace_all</span>(text, <span class="st">"</span><span class="sc">\\</span><span class="st">s+"</span>, <span class="st">" "</span>)</span>
|
|||
|
<span id="cb12-4"><a href="#cb12-4" tabindex="-1"></a><span class="co">#> [1] "Some badly spaced text"</span></span></code></pre></div></li>
|
|||
|
<li><p><code>\p{property name}</code> matches any character with
|
|||
|
specific unicode property, like <code>\p{Uppercase}</code> or
|
|||
|
<code>\p{Diacritic}</code>. The complement,
|
|||
|
<code>\P{property name}</code>, matches all characters without the
|
|||
|
property. A complete list of unicode properties can be found at <a href="https://www.unicode.org/reports/tr44/#Property_Index" class="uri">https://www.unicode.org/reports/tr44/#Property_Index</a>.</p>
|
|||
|
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" tabindex="-1"></a>(text <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">'"Double quotes"'</span>, <span class="st">"«Guillemet»"</span>, <span class="st">"“Fancy quotes”"</span>))</span>
|
|||
|
<span id="cb13-2"><a href="#cb13-2" tabindex="-1"></a><span class="co">#> [1] "\"Double quotes\"" "«Guillemet»" "“Fancy quotes”"</span></span>
|
|||
|
<span id="cb13-3"><a href="#cb13-3" tabindex="-1"></a><span class="fu">str_replace_all</span>(text, <span class="st">"</span><span class="sc">\\</span><span class="st">p{quotation mark}"</span>, <span class="st">"'"</span>)</span>
|
|||
|
<span id="cb13-4"><a href="#cb13-4" tabindex="-1"></a><span class="co">#> [1] "'Double quotes'" "'Guillemet'" "'Fancy quotes'"</span></span></code></pre></div></li>
|
|||
|
<li><p><code>\w</code> matches any “word” character, which includes
|
|||
|
alphabetic characters, marks and decimal numbers. The complement,
|
|||
|
<code>\W</code>, matches any non-word character.</p>
|
|||
|
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" tabindex="-1"></a><span class="fu">str_extract_all</span>(<span class="st">"Don't eat that!"</span>, <span class="st">"</span><span class="sc">\\</span><span class="st">w+"</span>)[[<span class="dv">1</span>]]</span>
|
|||
|
<span id="cb14-2"><a href="#cb14-2" tabindex="-1"></a><span class="co">#> [1] "Don" "t" "eat" "that"</span></span>
|
|||
|
<span id="cb14-3"><a href="#cb14-3" tabindex="-1"></a><span class="fu">str_split</span>(<span class="st">"Don't eat that!"</span>, <span class="st">"</span><span class="sc">\\</span><span class="st">W"</span>)[[<span class="dv">1</span>]]</span>
|
|||
|
<span id="cb14-4"><a href="#cb14-4" tabindex="-1"></a><span class="co">#> [1] "Don" "t" "eat" "that" ""</span></span></code></pre></div>
|
|||
|
<p>Technically, <code>\w</code> also matches connector punctuation,
|
|||
|
<code>\u200c</code> (zero width non-joiner), and <code>\u200d</code>
|
|||
|
(zero width joiner), but these are rarely seen in the wild.</p></li>
|
|||
|
<li><p><code>\b</code> matches word boundaries, the transition between
|
|||
|
word and non-word characters. <code>\B</code> matches the opposite:
|
|||
|
boundaries that have either both word or non-word characters on either
|
|||
|
side.</p>
|
|||
|
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" tabindex="-1"></a><span class="fu">str_replace_all</span>(<span class="st">"The quick brown fox"</span>, <span class="st">"</span><span class="sc">\\</span><span class="st">b"</span>, <span class="st">"_"</span>)</span>
|
|||
|
<span id="cb15-2"><a href="#cb15-2" tabindex="-1"></a><span class="co">#> [1] "_The_ _quick_ _brown_ _fox_"</span></span>
|
|||
|
<span id="cb15-3"><a href="#cb15-3" tabindex="-1"></a><span class="fu">str_replace_all</span>(<span class="st">"The quick brown fox"</span>, <span class="st">"</span><span class="sc">\\</span><span class="st">B"</span>, <span class="st">"_"</span>)</span>
|
|||
|
<span id="cb15-4"><a href="#cb15-4" tabindex="-1"></a><span class="co">#> [1] "T_h_e q_u_i_c_k b_r_o_w_n f_o_x"</span></span></code></pre></div></li>
|
|||
|
</ul>
|
|||
|
<p>You can also create your own <strong>character classes</strong> using
|
|||
|
<code>[]</code>:</p>
|
|||
|
<ul>
|
|||
|
<li><code>[abc]</code>: matches a, b, or c.</li>
|
|||
|
<li><code>[a-z]</code>: matches every character between a and z (in
|
|||
|
Unicode code point order).</li>
|
|||
|
<li><code>[^abc]</code>: matches anything except a, b, or c.</li>
|
|||
|
<li><code>[\^\-]</code>: matches <code>^</code> or <code>-</code>.</li>
|
|||
|
</ul>
|
|||
|
<p>There are a number of pre-built classes that you can use inside
|
|||
|
<code>[]</code>:</p>
|
|||
|
<ul>
|
|||
|
<li><code>[:punct:]</code>: punctuation.</li>
|
|||
|
<li><code>[:alpha:]</code>: letters.</li>
|
|||
|
<li><code>[:lower:]</code>: lowercase letters.</li>
|
|||
|
<li><code>[:upper:]</code>: uppercase letters.</li>
|
|||
|
<li><code>[:digit:]</code>: digits.</li>
|
|||
|
<li><code>[:xdigit:]</code>: hex digits.</li>
|
|||
|
<li><code>[:alnum:]</code>: letters and numbers.</li>
|
|||
|
<li><code>[:cntrl:]</code>: control characters.</li>
|
|||
|
<li><code>[:graph:]</code>: letters, numbers, and punctuation.</li>
|
|||
|
<li><code>[:print:]</code>: letters, numbers, punctuation, and
|
|||
|
whitespace.</li>
|
|||
|
<li><code>[:space:]</code>: space characters (basically equivalent to
|
|||
|
<code>\s</code>).</li>
|
|||
|
<li><code>[:blank:]</code>: space and tab.</li>
|
|||
|
</ul>
|
|||
|
<p>These all go inside the <code>[]</code> for character classes,
|
|||
|
i.e. <code>[[:digit:]AX]</code> matches all digits, A, and X.</p>
|
|||
|
<p>You can also using Unicode properties, like
|
|||
|
<code>[\p{Letter}]</code>, and various set operations, like
|
|||
|
<code>[\p{Letter}--\p{script=latin}]</code>. See
|
|||
|
<code>?"stringi-search-charclass"</code> for details.</p>
|
|||
|
</div>
|
|||
|
<div id="alternation" class="section level2">
|
|||
|
<h2>Alternation</h2>
|
|||
|
<p><code>|</code> is the <strong>alternation</strong> operator, which
|
|||
|
will pick between one or more possible matches. For example,
|
|||
|
<code>abc|def</code> will match <code>abc</code> or
|
|||
|
<code>def</code>:</p>
|
|||
|
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a><span class="fu">str_detect</span>(<span class="fu">c</span>(<span class="st">"abc"</span>, <span class="st">"def"</span>, <span class="st">"ghi"</span>), <span class="st">"abc|def"</span>)</span>
|
|||
|
<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a><span class="co">#> [1] TRUE TRUE FALSE</span></span></code></pre></div>
|
|||
|
<p>Note that the precedence for <code>|</code> is low:
|
|||
|
<code>abc|def</code> is equivalent to <code>(abc)|(def)</code> not
|
|||
|
<code>ab(c|d)ef</code>.</p>
|
|||
|
</div>
|
|||
|
<div id="grouping" class="section level2">
|
|||
|
<h2>Grouping</h2>
|
|||
|
<p>You can use parentheses to override the default precedence rules:</p>
|
|||
|
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" tabindex="-1"></a><span class="fu">str_extract</span>(<span class="fu">c</span>(<span class="st">"grey"</span>, <span class="st">"gray"</span>), <span class="st">"gre|ay"</span>)</span>
|
|||
|
<span id="cb17-2"><a href="#cb17-2" tabindex="-1"></a><span class="co">#> [1] "gre" "ay"</span></span>
|
|||
|
<span id="cb17-3"><a href="#cb17-3" tabindex="-1"></a><span class="fu">str_extract</span>(<span class="fu">c</span>(<span class="st">"grey"</span>, <span class="st">"gray"</span>), <span class="st">"gr(e|a)y"</span>)</span>
|
|||
|
<span id="cb17-4"><a href="#cb17-4" tabindex="-1"></a><span class="co">#> [1] "grey" "gray"</span></span></code></pre></div>
|
|||
|
<p>Parentheses also define “groups” that you can refer to with
|
|||
|
<strong>backreferences</strong>, like <code>\1</code>, <code>\2</code>
|
|||
|
etc, and can be extracted with <code>str_match()</code>. For example,
|
|||
|
the following regular expression finds all fruits that have a repeated
|
|||
|
pair of letters:</p>
|
|||
|
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" tabindex="-1"></a>pattern <span class="ot"><-</span> <span class="st">"(..)</span><span class="sc">\\</span><span class="st">1"</span></span>
|
|||
|
<span id="cb18-2"><a href="#cb18-2" tabindex="-1"></a>fruit <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb18-3"><a href="#cb18-3" tabindex="-1"></a> <span class="fu">str_subset</span>(pattern)</span>
|
|||
|
<span id="cb18-4"><a href="#cb18-4" tabindex="-1"></a><span class="co">#> [1] "banana"</span></span>
|
|||
|
<span id="cb18-5"><a href="#cb18-5" tabindex="-1"></a></span>
|
|||
|
<span id="cb18-6"><a href="#cb18-6" tabindex="-1"></a>fruit <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb18-7"><a href="#cb18-7" tabindex="-1"></a> <span class="fu">str_subset</span>(pattern) <span class="sc">%>%</span> </span>
|
|||
|
<span id="cb18-8"><a href="#cb18-8" tabindex="-1"></a> <span class="fu">str_match</span>(pattern)</span>
|
|||
|
<span id="cb18-9"><a href="#cb18-9" tabindex="-1"></a><span class="co">#> [,1] [,2]</span></span>
|
|||
|
<span id="cb18-10"><a href="#cb18-10" tabindex="-1"></a><span class="co">#> [1,] "anan" "an"</span></span></code></pre></div>
|
|||
|
<p>You can use <code>(?:...)</code>, the non-capturing parentheses, to
|
|||
|
control precedence but not capture the match in a group. This is
|
|||
|
slightly more efficient than capturing parentheses.</p>
|
|||
|
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" tabindex="-1"></a><span class="fu">str_match</span>(<span class="fu">c</span>(<span class="st">"grey"</span>, <span class="st">"gray"</span>), <span class="st">"gr(e|a)y"</span>)</span>
|
|||
|
<span id="cb19-2"><a href="#cb19-2" tabindex="-1"></a><span class="co">#> [,1] [,2]</span></span>
|
|||
|
<span id="cb19-3"><a href="#cb19-3" tabindex="-1"></a><span class="co">#> [1,] "grey" "e" </span></span>
|
|||
|
<span id="cb19-4"><a href="#cb19-4" tabindex="-1"></a><span class="co">#> [2,] "gray" "a"</span></span>
|
|||
|
<span id="cb19-5"><a href="#cb19-5" tabindex="-1"></a><span class="fu">str_match</span>(<span class="fu">c</span>(<span class="st">"grey"</span>, <span class="st">"gray"</span>), <span class="st">"gr(?:e|a)y"</span>)</span>
|
|||
|
<span id="cb19-6"><a href="#cb19-6" tabindex="-1"></a><span class="co">#> [,1] </span></span>
|
|||
|
<span id="cb19-7"><a href="#cb19-7" tabindex="-1"></a><span class="co">#> [1,] "grey"</span></span>
|
|||
|
<span id="cb19-8"><a href="#cb19-8" tabindex="-1"></a><span class="co">#> [2,] "gray"</span></span></code></pre></div>
|
|||
|
<p>This is most useful for more complex cases where you need to capture
|
|||
|
matches and control precedence independently.</p>
|
|||
|
</div>
|
|||
|
<div id="anchors" class="section level2">
|
|||
|
<h2>Anchors</h2>
|
|||
|
<p>By default, regular expressions will match any part of a string. It’s
|
|||
|
often useful to <strong>anchor</strong> the regular expression so that
|
|||
|
it matches from the start or end of the string:</p>
|
|||
|
<ul>
|
|||
|
<li><code>^</code> matches the start of the string.</li>
|
|||
|
<li><code>$</code> matches the end of the string.</li>
|
|||
|
</ul>
|
|||
|
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a>x <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"apple"</span>, <span class="st">"banana"</span>, <span class="st">"pear"</span>)</span>
|
|||
|
<span id="cb20-2"><a href="#cb20-2" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"^a"</span>)</span>
|
|||
|
<span id="cb20-3"><a href="#cb20-3" tabindex="-1"></a><span class="co">#> [1] "a" NA NA</span></span>
|
|||
|
<span id="cb20-4"><a href="#cb20-4" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"a$"</span>)</span>
|
|||
|
<span id="cb20-5"><a href="#cb20-5" tabindex="-1"></a><span class="co">#> [1] NA "a" NA</span></span></code></pre></div>
|
|||
|
<p>To match a literal “$” or “^”, you need to escape them,
|
|||
|
<code>\$</code>, and <code>\^</code>.</p>
|
|||
|
<p>For multiline strings, you can use
|
|||
|
<code>regex(multiline = TRUE)</code>. This changes the behaviour of
|
|||
|
<code>^</code> and <code>$</code>, and introduces three new
|
|||
|
operators:</p>
|
|||
|
<ul>
|
|||
|
<li><p><code>^</code> now matches the start of each line.</p></li>
|
|||
|
<li><p><code>$</code> now matches the end of each line.</p></li>
|
|||
|
<li><p><code>\A</code> matches the start of the input.</p></li>
|
|||
|
<li><p><code>\z</code> matches the end of the input.</p></li>
|
|||
|
<li><p><code>\Z</code> matches the end of the input, but before the
|
|||
|
final line terminator, if it exists.</p></li>
|
|||
|
</ul>
|
|||
|
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" tabindex="-1"></a>x <span class="ot"><-</span> <span class="st">"Line 1</span><span class="sc">\n</span><span class="st">Line 2</span><span class="sc">\n</span><span class="st">Line 3</span><span class="sc">\n</span><span class="st">"</span></span>
|
|||
|
<span id="cb21-2"><a href="#cb21-2" tabindex="-1"></a><span class="fu">str_extract_all</span>(x, <span class="st">"^Line.."</span>)[[<span class="dv">1</span>]]</span>
|
|||
|
<span id="cb21-3"><a href="#cb21-3" tabindex="-1"></a><span class="co">#> [1] "Line 1"</span></span>
|
|||
|
<span id="cb21-4"><a href="#cb21-4" tabindex="-1"></a><span class="fu">str_extract_all</span>(x, <span class="fu">regex</span>(<span class="st">"^Line.."</span>, <span class="at">multiline =</span> <span class="cn">TRUE</span>))[[<span class="dv">1</span>]]</span>
|
|||
|
<span id="cb21-5"><a href="#cb21-5" tabindex="-1"></a><span class="co">#> [1] "Line 1" "Line 2" "Line 3"</span></span>
|
|||
|
<span id="cb21-6"><a href="#cb21-6" tabindex="-1"></a><span class="fu">str_extract_all</span>(x, <span class="fu">regex</span>(<span class="st">"</span><span class="sc">\\</span><span class="st">ALine.."</span>, <span class="at">multiline =</span> <span class="cn">TRUE</span>))[[<span class="dv">1</span>]]</span>
|
|||
|
<span id="cb21-7"><a href="#cb21-7" tabindex="-1"></a><span class="co">#> [1] "Line 1"</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="repetition" class="section level2">
|
|||
|
<h2>Repetition</h2>
|
|||
|
<p>You can control how many times a pattern matches with the repetition
|
|||
|
operators:</p>
|
|||
|
<ul>
|
|||
|
<li><code>?</code>: 0 or 1.</li>
|
|||
|
<li><code>+</code>: 1 or more.</li>
|
|||
|
<li><code>*</code>: 0 or more.</li>
|
|||
|
</ul>
|
|||
|
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" tabindex="-1"></a>x <span class="ot"><-</span> <span class="st">"1888 is the longest year in Roman numerals: MDCCCLXXXVIII"</span></span>
|
|||
|
<span id="cb22-2"><a href="#cb22-2" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"CC?"</span>)</span>
|
|||
|
<span id="cb22-3"><a href="#cb22-3" tabindex="-1"></a><span class="co">#> [1] "CC"</span></span>
|
|||
|
<span id="cb22-4"><a href="#cb22-4" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"CC+"</span>)</span>
|
|||
|
<span id="cb22-5"><a href="#cb22-5" tabindex="-1"></a><span class="co">#> [1] "CCC"</span></span>
|
|||
|
<span id="cb22-6"><a href="#cb22-6" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">'C[LX]+'</span>)</span>
|
|||
|
<span id="cb22-7"><a href="#cb22-7" tabindex="-1"></a><span class="co">#> [1] "CLXXX"</span></span></code></pre></div>
|
|||
|
<p>Note that the precedence of these operators is high, so you can
|
|||
|
write: <code>colou?r</code> to match either American or British
|
|||
|
spellings. That means most uses will need parentheses, like
|
|||
|
<code>bana(na)+</code>.</p>
|
|||
|
<p>You can also specify the number of matches precisely:</p>
|
|||
|
<ul>
|
|||
|
<li><code>{n}</code>: exactly n</li>
|
|||
|
<li><code>{n,}</code>: n or more</li>
|
|||
|
<li><code>{n,m}</code>: between n and m</li>
|
|||
|
</ul>
|
|||
|
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"C{2}"</span>)</span>
|
|||
|
<span id="cb23-2"><a href="#cb23-2" tabindex="-1"></a><span class="co">#> [1] "CC"</span></span>
|
|||
|
<span id="cb23-3"><a href="#cb23-3" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"C{2,}"</span>)</span>
|
|||
|
<span id="cb23-4"><a href="#cb23-4" tabindex="-1"></a><span class="co">#> [1] "CCC"</span></span>
|
|||
|
<span id="cb23-5"><a href="#cb23-5" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"C{2,3}"</span>)</span>
|
|||
|
<span id="cb23-6"><a href="#cb23-6" tabindex="-1"></a><span class="co">#> [1] "CCC"</span></span></code></pre></div>
|
|||
|
<p>By default these matches are “greedy”: they will match the longest
|
|||
|
string possible. You can make them “lazy”, matching the shortest string
|
|||
|
possible by putting a <code>?</code> after them:</p>
|
|||
|
<ul>
|
|||
|
<li><code>??</code>: 0 or 1, prefer 0.</li>
|
|||
|
<li><code>+?</code>: 1 or more, match as few times as possible.</li>
|
|||
|
<li><code>*?</code>: 0 or more, match as few times as possible.</li>
|
|||
|
<li><code>{n,}?</code>: n or more, match as few times as possible.</li>
|
|||
|
<li><code>{n,m}?</code>: between n and m, match as few times as
|
|||
|
possible, but at least n.</li>
|
|||
|
</ul>
|
|||
|
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="fu">c</span>(<span class="st">"C{2,3}"</span>, <span class="st">"C{2,3}?"</span>))</span>
|
|||
|
<span id="cb24-2"><a href="#cb24-2" tabindex="-1"></a><span class="co">#> [1] "CCC" "CC"</span></span>
|
|||
|
<span id="cb24-3"><a href="#cb24-3" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="fu">c</span>(<span class="st">"C[LX]+"</span>, <span class="st">"C[LX]+?"</span>))</span>
|
|||
|
<span id="cb24-4"><a href="#cb24-4" tabindex="-1"></a><span class="co">#> [1] "CLXXX" "CL"</span></span></code></pre></div>
|
|||
|
<p>You can also make the matches possessive by putting a <code>+</code>
|
|||
|
after them, which means that if later parts of the match fail, the
|
|||
|
repetition will not be re-tried with a smaller number of characters.
|
|||
|
This is an advanced feature used to improve performance in worst-case
|
|||
|
scenarios (called “catastrophic backtracking”).</p>
|
|||
|
<ul>
|
|||
|
<li><code>?+</code>: 0 or 1, possessive.</li>
|
|||
|
<li><code>++</code>: 1 or more, possessive.</li>
|
|||
|
<li><code>*+</code>: 0 or more, possessive.</li>
|
|||
|
<li><code>{n}+</code>: exactly n, possessive.</li>
|
|||
|
<li><code>{n,}+</code>: n or more, possessive.</li>
|
|||
|
<li><code>{n,m}+</code>: between n and m, possessive.</li>
|
|||
|
</ul>
|
|||
|
<p>A related concept is the <strong>atomic-match</strong> parenthesis,
|
|||
|
<code>(?>...)</code>. If a later match fails and the engine needs to
|
|||
|
back-track, an atomic match is kept as is: it succeeds or fails as a
|
|||
|
whole. Compare the following two regular expressions:</p>
|
|||
|
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" tabindex="-1"></a><span class="fu">str_detect</span>(<span class="st">"ABC"</span>, <span class="st">"(?>A|.B)C"</span>)</span>
|
|||
|
<span id="cb25-2"><a href="#cb25-2" tabindex="-1"></a><span class="co">#> [1] FALSE</span></span>
|
|||
|
<span id="cb25-3"><a href="#cb25-3" tabindex="-1"></a><span class="fu">str_detect</span>(<span class="st">"ABC"</span>, <span class="st">"(?:A|.B)C"</span>)</span>
|
|||
|
<span id="cb25-4"><a href="#cb25-4" tabindex="-1"></a><span class="co">#> [1] TRUE</span></span></code></pre></div>
|
|||
|
<p>The atomic match fails because it matches A, and then the next
|
|||
|
character is B rather than the C the pattern expects, so it fails. The regular match succeeds because it
|
|||
|
matches A, but then C doesn’t match, so it back-tracks and tries <code>.B</code>
|
|||
|
instead.</p>
|
|||
|
</div>
|
|||
|
<div id="look-arounds" class="section level2">
|
|||
|
<h2>Look arounds</h2>
|
|||
|
<p>These assertions look ahead or behind the current match without
|
|||
|
“consuming” any characters (i.e. changing the input position).</p>
|
|||
|
<ul>
|
|||
|
<li><p><code>(?=...)</code>: positive look-ahead assertion. Matches if
|
|||
|
<code>...</code> matches at the current input.</p></li>
|
|||
|
<li><p><code>(?!...)</code>: negative look-ahead assertion. Matches if
|
|||
|
<code>...</code> <strong>does not</strong> match at the current
|
|||
|
input.</p></li>
|
|||
|
<li><p><code>(?<=...)</code>: positive look-behind assertion. Matches
|
|||
|
if <code>...</code> matches text preceding the current position, with
|
|||
|
the last character of the match being the character just before the
|
|||
|
current position. Length must be bounded
|
|||
|
(i.e. no <code>*</code> or <code>+</code>).</p></li>
|
|||
|
<li><p><code>(?<!...)</code>: negative look-behind assertion. Matches
|
|||
|
if <code>...</code> <strong>does not</strong> match text preceding the
|
|||
|
current position. Length must be bounded
|
|||
|
(i.e. no <code>*</code> or <code>+</code>).</p></li>
|
|||
|
</ul>
|
|||
|
<p>These are useful when you want to check that a pattern exists, but
|
|||
|
you don’t want to include it in the result:</p>
|
|||
|
<div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" tabindex="-1"></a>x <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"1 piece"</span>, <span class="st">"2 pieces"</span>, <span class="st">"3"</span>)</span>
|
|||
|
<span id="cb26-2"><a href="#cb26-2" tabindex="-1"></a><span class="fu">str_extract</span>(x, <span class="st">"</span><span class="sc">\\</span><span class="st">d+(?= pieces?)"</span>)</span>
|
|||
|
<span id="cb26-3"><a href="#cb26-3" tabindex="-1"></a><span class="co">#> [1] "1" "2" NA</span></span>
|
|||
|
<span id="cb26-4"><a href="#cb26-4" tabindex="-1"></a></span>
|
|||
|
<span id="cb26-5"><a href="#cb26-5" tabindex="-1"></a>y <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"100"</span>, <span class="st">"$400"</span>)</span>
|
|||
|
<span id="cb26-6"><a href="#cb26-6" tabindex="-1"></a><span class="fu">str_extract</span>(y, <span class="st">"(?<=</span><span class="sc">\\</span><span class="st">$)</span><span class="sc">\\</span><span class="st">d+"</span>)</span>
|
|||
|
<span id="cb26-7"><a href="#cb26-7" tabindex="-1"></a><span class="co">#> [1] NA "400"</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
<div id="comments" class="section level2">
|
|||
|
<h2>Comments</h2>
|
|||
|
<p>There are two ways to include comments in a regular expression. The
|
|||
|
first is with <code>(?#...)</code>:</p>
|
|||
|
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" tabindex="-1"></a><span class="fu">str_detect</span>(<span class="st">"xyz"</span>, <span class="st">"x(?#this is a comment)"</span>)</span>
|
|||
|
<span id="cb27-2"><a href="#cb27-2" tabindex="-1"></a><span class="co">#> [1] TRUE</span></span></code></pre></div>
|
|||
|
<p>The second is to use <code>regex(comments = TRUE)</code>. This form
|
|||
|
ignores spaces and newlines, and everything after
|
|||
|
<code>#</code>. To match a literal space, you’ll need to escape it:
|
|||
|
<code>"\\ "</code>. This is a useful way of describing complex regular
|
|||
|
expressions:</p>
|
|||
|
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" tabindex="-1"></a>phone <span class="ot"><-</span> <span class="fu">regex</span>(<span class="st">"</span></span>
|
|||
|
<span id="cb28-2"><a href="#cb28-2" tabindex="-1"></a><span class="st"> </span><span class="sc">\\</span><span class="st">(? # optional opening parens</span></span>
|
|||
|
<span id="cb28-3"><a href="#cb28-3" tabindex="-1"></a><span class="st"> (</span><span class="sc">\\</span><span class="st">d{3}) # area code</span></span>
|
|||
|
<span id="cb28-4"><a href="#cb28-4" tabindex="-1"></a><span class="st"> </span><span class="sc">\\</span><span class="st">)? # optional closing parens</span></span>
|
|||
|
<span id="cb28-5"><a href="#cb28-5" tabindex="-1"></a><span class="st"> (?:-|</span><span class="sc">\\</span><span class="st"> )? # optional dash or space</span></span>
|
|||
|
<span id="cb28-6"><a href="#cb28-6" tabindex="-1"></a><span class="st"> (</span><span class="sc">\\</span><span class="st">d{3}) # another three numbers</span></span>
|
|||
|
<span id="cb28-7"><a href="#cb28-7" tabindex="-1"></a><span class="st"> (?:-|</span><span class="sc">\\</span><span class="st"> )? # optional dash or space</span></span>
|
|||
|
<span id="cb28-8"><a href="#cb28-8" tabindex="-1"></a><span class="st"> (</span><span class="sc">\\</span><span class="st">d{3}) # three more numbers</span></span>
|
|||
|
<span id="cb28-9"><a href="#cb28-9" tabindex="-1"></a><span class="st"> "</span>, <span class="at">comments =</span> <span class="cn">TRUE</span>)</span>
|
|||
|
<span id="cb28-10"><a href="#cb28-10" tabindex="-1"></a></span>
|
|||
|
<span id="cb28-11"><a href="#cb28-11" tabindex="-1"></a><span class="fu">str_match</span>(<span class="fu">c</span>(<span class="st">"514-791-8141"</span>, <span class="st">"(514) 791 8141"</span>), phone)</span>
|
|||
|
<span id="cb28-12"><a href="#cb28-12" tabindex="-1"></a><span class="co">#> [,1] [,2] [,3] [,4] </span></span>
|
|||
|
<span id="cb28-13"><a href="#cb28-13" tabindex="-1"></a><span class="co">#> [1,] "514-791-814" "514" "791" "814"</span></span>
|
|||
|
<span id="cb28-14"><a href="#cb28-14" tabindex="-1"></a><span class="co">#> [2,] "(514) 791 814" "514" "791" "814"</span></span></code></pre></div>
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<!-- code folding -->
|
|||
|
|
|||
|
|
|||
|
<!-- dynamically load mathjax for compatibility with self-contained -->
|
|||
|
<script>
|
|||
|
(function () {
|
|||
|
var script = document.createElement("script");
|
|||
|
script.type = "text/javascript";
|
|||
|
script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
|
|||
|
document.getElementsByTagName("head")[0].appendChild(script);
|
|||
|
})();
|
|||
|
</script>
|
|||
|
|
|||
|
</body>
|
|||
|
</html>
|