<!DOCTYPE html>
|
||
|
||
<html lang="en">
|
||
|
||
<head>
|
||
|
||
<meta charset="utf-8" />
|
||
<meta name="generator" content="pandoc" />
|
||
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
|
||
|
||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||
|
||
|
||
|
||
<title>Unicode: Emoji, accents, and international text</title>
|
||
|
||
<script>
// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
// be compatible with the behavior of Pandoc < 2.8).
document.addEventListener('DOMContentLoaded', function(e) {
  // First child of every div.section whose class attribute contains "level".
  var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
  var i, h, a;
  for (i = 0; i < hs.length; i++) {
    h = hs[i];
    if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6
    a = h.attributes;
    // Keep removing the first remaining attribute until none are left; the
    // loop relies on a.length shrinking after each removeAttribute call.
    while (a.length > 0) h.removeAttribute(a[0].name);
  }
});
</script>
|
||
|
||
<style type="text/css">
/* Base helper classes emitted with the pandoc output. */
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
|
||
|
||
|
||
|
||
<style type="text/css">
/* Keep code on its original lines and let highlighted blocks overflow. */
code {
  white-space: pre;
}
.sourceCode {
  overflow: visible;
}
</style>
|
||
<style type="text/css" data-origin="pandoc">
/*
 * Syntax-highlighting sheet. The data-origin="pandoc" attribute is how the
 * script that follows this element finds the sheet, and that script matches
 * rules whose selector is exactly "div.sourceCode", so keep those rules and
 * their order intact.
 */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
  div.sourceCode { overflow: auto; }
}
@media print {
  pre > code.sourceCode { white-space: pre-wrap; }
  pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}

/* Line numbering for pre.numberSource blocks, driven by a CSS counter. */
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }

/* Intentionally empty: placeholder rule for a block-level color/background. */
div.sourceCode
  { }
@media screen {
  pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}

/* Token colors, keyed by the two-letter highlighting classes. */
code span.al { color: #ff0000; font-weight: bold; }
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.at { color: #7d9029; }
code span.bn { color: #40a070; }
code span.bu { color: #008000; }
code span.cf { color: #007020; font-weight: bold; }
code span.ch { color: #4070a0; }
code span.cn { color: #880000; }
code span.co { color: #60a0b0; font-style: italic; }
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.do { color: #ba2121; font-style: italic; }
code span.dt { color: #902000; }
code span.dv { color: #40a070; }
code span.er { color: #ff0000; font-weight: bold; }
code span.ex { }
code span.fl { color: #40a070; }
code span.fu { color: #06287e; }
code span.im { color: #008000; font-weight: bold; }
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.kw { color: #007020; font-weight: bold; }
code span.op { color: #666666; }
code span.ot { color: #007020; }
code span.pp { color: #bc7a00; }
code span.sc { color: #4070a0; }
code span.ss { color: #bb6688; }
code span.st { color: #4070a0; }
code span.va { color: #19177c; }
code span.vs { color: #4070a0; }
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
</style>
|
||
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
(function() {
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    // ownerNode can be null (a sheet with no owning element) or a node
    // without a dataset; reading .dataset unguarded would throw and abort
    // the whole loop, so such sheets are skipped instead.
    var owner = sheets[i].ownerNode;
    if (!owner || !owner.dataset || owner.dataset["origin"] !== "pandoc") continue;
    // Reading cssRules can throw (e.g. for sheets the page may not inspect).
    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
    var j = 0;
    while (j < rules.length) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
        j++;
        continue;
      }
      var style = rule.style.cssText;
      // check if color or background-color is set
      if (rule.style.color === '' && rule.style.backgroundColor === '') {
        j++;
        continue;
      }
      // replace div.sourceCode by a pre.sourceCode rule
      // (j is not advanced here: the next pass sees the inserted rule, whose
      // selector no longer matches, and steps past it)
      sheets[i].deleteRule(j);
      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
</script>
|
||
|
||
|
||
|
||
|
||
<style type="text/css">
/* Page layout and typography. */
body {
  background-color: #fff;
  margin: 1em auto;
  max-width: 700px;
  overflow: visible;
  padding-left: 2em;
  padding-right: 2em;
  font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
  font-size: 14px;
  line-height: 1.35;
}

/* Table of contents box. */
#TOC {
  clear: both;
  margin: 0 0 10px 10px;
  padding: 4px;
  width: 400px;
  border: 1px solid #CCCCCC;
  border-radius: 5px;
  background-color: #f6f6f6;
  font-size: 13px;
  line-height: 1.3;
}
#TOC .toctitle {
  font-weight: bold;
  font-size: 15px;
  margin-left: 5px;
}
#TOC ul {
  padding-left: 40px;
  margin-left: -1.5em;
  margin-top: 5px;
  margin-bottom: 5px;
}
#TOC ul ul {
  margin-left: -2em;
}
#TOC li {
  line-height: 16px;
}

/* Tables: horizontal rules only, striped rows. */
table {
  margin: 1em auto;
  border-width: 1px;
  border-color: #DDDDDD;
  border-style: outset;
  border-collapse: collapse;
}
table th {
  border-width: 2px;
  padding: 5px;
  border-style: inset;
}
table td {
  border-width: 1px;
  border-style: inset;
  line-height: 18px;
  padding: 5px 5px;
}
table, table th, table td {
  border-left-style: none;
  border-right-style: none;
}
table thead, table tr.even {
  background-color: #f7f7f7;
}

/* Block-level text elements. */
p {
  margin: 0.5em 0;
}
blockquote {
  background-color: #f6f6f6;
  padding: 0.25em 0.75em;
}
hr {
  /* "border: none" resets every side; only the top edge is then drawn. */
  border: none;
  border-top: 1px solid #777;
  margin: 28px 0;
}
dl {
  margin-left: 0;
}
dl dd {
  margin-bottom: 13px;
  margin-left: 13px;
}
dl dt {
  font-weight: bold;
}
ul {
  margin-top: 0;
}
ul li {
  list-style: circle outside;
}
ul ul {
  margin-bottom: 0;
}

/* Code blocks and inline code. */
pre, code {
  background-color: #f7f7f7;
  border-radius: 3px;
  color: #333;
  white-space: pre-wrap;
}
pre {
  border-radius: 3px;
  margin: 5px 0px 10px 0px;
  padding: 10px;
}
pre:not([class]) {
  background-color: #f7f7f7;
}
code {
  font-family: Consolas, Monaco, 'Courier New', monospace;
  font-size: 85%;
}
p > code, li > code {
  padding: 2px 0px;
}

/* Figures and images. */
div.figure {
  text-align: center;
}
img {
  background-color: #FFFFFF;
  padding: 2px;
  border-radius: 3px;
  border: 1px solid #CCCCCC;
  margin: 0 5px;
}

/* Headings. */
h1 {
  margin-top: 0;
  font-size: 35px;
  line-height: 40px;
}
h2 {
  border-bottom: 4px solid #f7f7f7;
  padding-top: 10px;
  padding-bottom: 2px;
  font-size: 145%;
}
h3 {
  border-bottom: 2px solid #f7f7f7;
  padding-top: 10px;
  font-size: 120%;
}
h4 {
  border-bottom: 1px solid #f7f7f7;
  margin-left: 8px;
  font-size: 105%;
}
h5, h6 {
  border-bottom: 1px solid #ccc;
  font-size: 105%;
}

/* Links: only absolute http(s) links are underlined. */
a {
  color: #0033dd;
  text-decoration: none;
}
a:hover {
  color: #6666ff; }
a:visited {
  color: #800080; }
a:visited:hover {
  color: #BB00BB; }
a[href^="http:"] {
  text-decoration: underline; }
a[href^="https:"] {
  text-decoration: underline; }

/* Token colors for spans that are direct children of code. */
code > span.kw { color: #555; font-weight: bold; }
code > span.dt { color: #902000; }
code > span.dv { color: #40a070; }
code > span.bn { color: #d14; }
code > span.fl { color: #d14; }
code > span.ch { color: #d14; }
code > span.st { color: #d14; }
code > span.co { color: #888888; font-style: italic; }
code > span.ot { color: #007020; }
code > span.al { color: #ff0000; font-weight: bold; }
code > span.fu { color: #900; font-weight: bold; }
code > span.er { color: #a61717; background-color: #e3d2d2; }
</style>
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
<body>
|
||
|
||
|
||
|
||
|
||
<h1 class="title toc-ignore">Unicode: Emoji, accents, and international
|
||
text</h1>
|
||
|
||
|
||
|
||
<div id="character-encoding" class="section level2">
|
||
<h2>Character encoding</h2>
|
||
<p>Before we can analyze a text in R, we first need to get its digital
|
||
representation, a sequence of ones and zeros. In practice this works by
|
||
first choosing an <em>encoding</em> for the text that assigns each
|
||
character a numerical value, and then translating the sequence of
|
||
characters in the text to the corresponding sequence of numbers
|
||
specified by the encoding. Today, most new text is encoded according to
|
||
the <a href="http://unicode.org/charts/">Unicode standard</a>,
|
||
specifically the 8-bit Unicode Transformation Format, <a href="https://en.wikipedia.org/wiki/UTF-8">UTF-8</a>. Joel Spolsky gives
|
||
a good overview of the situation in an <a href="https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/">essay
|
||
from 2003</a>.</p>
|
||
<p>The software community has mostly moved to UTF-8 as a standard for
|
||
text storage and interchange, but there is still a large volume of text
|
||
in other encodings. Whenever you read a text file into R, you need to
|
||
specify the encoding. If you don’t, R will try to guess the encoding,
|
||
and if it guesses incorrectly, it will wrongly interpret the sequence of
|
||
ones and zeros.</p>
|
||
<p>We will demonstrate the difficulties of encodings with the text of
|
||
Jane Austen’s novel, <em>Mansfield Park</em>, provided by <a href="http://www.gutenberg.org">Project Gutenberg</a>. We will download
|
||
the text, then read in the lines of the novel.</p>
|
||
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a><span class="co"># download the zipped text from a Project Gutenberg mirror</span></span>
|
||
<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a>url <span class="ot"><-</span> <span class="st">"http://mirror.csclub.uwaterloo.ca/gutenberg/1/4/141/141.zip"</span></span>
|
||
<span id="cb1-3"><a href="#cb1-3" tabindex="-1"></a>tmp <span class="ot"><-</span> <span class="fu">tempfile</span>()</span>
|
||
<span id="cb1-4"><a href="#cb1-4" tabindex="-1"></a><span class="fu">download.file</span>(url, tmp)</span>
|
||
<span id="cb1-5"><a href="#cb1-5" tabindex="-1"></a></span>
|
||
<span id="cb1-6"><a href="#cb1-6" tabindex="-1"></a><span class="co"># read the text from the zip file</span></span>
|
||
<span id="cb1-7"><a href="#cb1-7" tabindex="-1"></a>con <span class="ot"><-</span> <span class="fu">unz</span>(tmp, <span class="st">"141.txt"</span>, <span class="at">encoding =</span> <span class="st">"UTF-8"</span>)</span>
|
||
<span id="cb1-8"><a href="#cb1-8" tabindex="-1"></a>lines <span class="ot"><-</span> <span class="fu">readLines</span>(con)</span>
|
||
<span id="cb1-9"><a href="#cb1-9" tabindex="-1"></a><span class="fu">close</span>(con)</span></code></pre></div>
|
||
<p>The <code>unz</code> function and other similar file connection
|
||
functions have <code>encoding</code> arguments which, if left
|
||
unspecified, default to assuming that text is encoded in your operating
|
||
system’s native encoding. To ensure consistent behavior across all
|
||
platforms (Mac, Windows, and Linux), you should set this option
|
||
explicitly. Here, we set <code>encoding = "UTF-8"</code>. This is a
|
||
reasonable default, but it is not always appropriate. In general, you
|
||
should determine the appropriate <code>encoding</code> value by looking
|
||
at the file. Unfortunately, the file extension <code>".txt"</code> is
|
||
not informative, and could correspond to any encoding. However, if we
|
||
read the first few lines of the file, we see the following:</p>
|
||
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a>lines[<span class="dv">11</span><span class="sc">:</span><span class="dv">20</span>]</span></code></pre></div>
|
||
<pre><code> [1] "Author: Jane Austen"
|
||
[2] ""
|
||
[3] "Release Date: June, 1994 [Etext #141]"
|
||
[4] "Posting Date: February 11, 2015"
|
||
[5] ""
|
||
[6] "Language: English"
|
||
[7] ""
|
||
[8] "Character set encoding: ASCII"
|
||
[9] ""
|
||
[10] "*** START OF THIS PROJECT GUTENBERG EBOOK MANSFIELD PARK ***"</code></pre>
|
||
<p>The character set encoding is reported as ASCII, which is a subset of
|
||
UTF-8. So, we should be in good shape.</p>
|
||
<p>Unfortunately, we run into trouble as soon as we try to process the
|
||
text:</p>
|
||
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a>corpus<span class="sc">::</span><span class="fu">term_stats</span>(lines) <span class="co"># produces an error</span></span></code></pre></div>
|
||
<pre><code>Error in corpus::term_stats(lines): argument entry 15252 is incorrectly marked as "UTF-8": invalid leading byte (0xA3) at position 36</code></pre>
|
||
<p>The error message tells us that line 15252 contains an invalid
|
||
byte.</p>
|
||
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a>lines[<span class="dv">15252</span>]</span></code></pre></div>
|
||
<pre><code>[1] "the command of her beauty, and her \xa320,000, any one who could satisfy the"</code></pre>
|
||
<p>We might wonder if there are other lines with invalid data. We can
|
||
find all such lines using the <code>utf8_valid</code> function:</p>
|
||
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" tabindex="-1"></a>lines[<span class="sc">!</span><span class="fu">utf8_valid</span>(lines)]</span></code></pre></div>
|
||
<pre><code>[1] "the command of her beauty, and her \xa320,000, any one who could satisfy the"</code></pre>
|
||
<p>So, there are no other invalid lines.</p>
|
||
<p>The offending byte in line 15252 is displayed as <code>\xa3</code>,
|
||
an escape code for hexadecimal value 0xa3, decimal value 163. To
|
||
understand why this is invalid, we need to learn more about UTF-8
|
||
encoding.</p>
|
||
</div>
|
||
<div id="utf-8" class="section level2">
|
||
<h2>UTF-8</h2>
|
||
<div id="ascii" class="section level3">
|
||
<h3>ASCII</h3>
|
||
<p>The smallest unit of data transfer on modern computers is the byte, a
|
||
sequence of eight ones and zeros that can encode a number between 0 and
|
||
255 (hexadecimal 0x00 and 0xff). In the earliest character encodings,
|
||
the numbers from 0 to 127 (hexadecimal 0x00 to 0x7f) were standardized
|
||
in an encoding known as ASCII, the American Standard Code for
|
||
Information Interchange. Here are the characters corresponding to these
|
||
codes:</p>
|
||
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a>codes <span class="ot"><-</span> <span class="fu">matrix</span>(<span class="dv">0</span><span class="sc">:</span><span class="dv">127</span>, <span class="dv">8</span>, <span class="dv">16</span>, <span class="at">byrow =</span> <span class="cn">TRUE</span>,</span>
|
||
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a> <span class="at">dimnames =</span> <span class="fu">list</span>(<span class="dv">0</span><span class="sc">:</span><span class="dv">7</span>, <span class="fu">c</span>(<span class="dv">0</span><span class="sc">:</span><span class="dv">9</span>, letters[<span class="dv">1</span><span class="sc">:</span><span class="dv">6</span>])))</span>
|
||
<span id="cb10-3"><a href="#cb10-3" tabindex="-1"></a>ascii <span class="ot"><-</span> <span class="fu">apply</span>(codes, <span class="fu">c</span>(<span class="dv">1</span>, <span class="dv">2</span>), intToUtf8)</span>
|
||
<span id="cb10-4"><a href="#cb10-4" tabindex="-1"></a></span>
|
||
<span id="cb10-5"><a href="#cb10-5" tabindex="-1"></a><span class="co"># replace control codes with ""</span></span>
|
||
<span id="cb10-6"><a href="#cb10-6" tabindex="-1"></a>ascii[<span class="st">"0"</span>, <span class="fu">c</span>(<span class="dv">0</span><span class="sc">:</span><span class="dv">6</span>, <span class="st">"e"</span>, <span class="st">"f"</span>)] <span class="ot"><-</span> <span class="st">""</span></span>
|
||
<span id="cb10-7"><a href="#cb10-7" tabindex="-1"></a>ascii[<span class="st">"1"</span>,] <span class="ot"><-</span> <span class="st">""</span></span>
|
||
<span id="cb10-8"><a href="#cb10-8" tabindex="-1"></a>ascii[<span class="st">"7"</span>, <span class="st">"f"</span>] <span class="ot"><-</span> <span class="st">""</span></span>
|
||
<span id="cb10-9"><a href="#cb10-9" tabindex="-1"></a></span>
|
||
<span id="cb10-10"><a href="#cb10-10" tabindex="-1"></a><span class="fu">utf8_print</span>(ascii, <span class="at">quote =</span> <span class="cn">FALSE</span>)</span></code></pre></div>
|
||
<pre><code> 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
||
0 \a \b \t \n \v \f \r
|
||
1
|
||
2 ! " # $ % & ' ( ) * + , - . /
|
||
3 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
|
||
4 @ A B C D E F G H I J K L M N O
|
||
5 P Q R S T U V W X Y Z [ \\ ] ^ _
|
||
6 ` a b c d e f g h i j k l m n o
|
||
7 p q r s t u v w x y z { | } ~ </code></pre>
|
||
<p>The first 32 codes (the first two rows of the table) are special
|
||
control codes, the most common of which, <code>0x0a</code>, denotes a new
|
||
line (<code>\n</code>). The special code <code>0x00</code> often denotes
|
||
the end of the input, and R does not allow this value in character
|
||
strings. Code <code>0x7f</code> corresponds to a “delete” control.</p>
|
||
<p>When you call <code>utf8_print</code>, it uses the low-level
|
||
<code>utf8_encode</code> subroutine to format control codes; they format as
|
||
<code>\uXXXX</code> for four hexadecimal digits <code>XXXX</code> or as
|
||
<code>\UXXXXYYYY</code> for eight hexadecimal digits
|
||
<code>XXXXYYYY</code>:</p>
|
||
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" tabindex="-1"></a><span class="fu">utf8_print</span>(<span class="fu">intToUtf8</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">0x0f</span>), <span class="at">quote =</span> <span class="cn">FALSE</span>)</span></code></pre></div>
|
||
<pre><code>[1] \u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000e\u000f</code></pre>
|
||
<p>Compare <code>utf8_print</code> output with the output of the base
|
||
R print function:</p>
|
||
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" tabindex="-1"></a><span class="fu">print</span>(<span class="fu">intToUtf8</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">0x0f</span>), <span class="at">quote =</span> <span class="cn">FALSE</span>)</span></code></pre></div>
|
||
<pre><code>[1] \001\002\003\004\005\006\a\b\t\n\v\f\r\016\017</code></pre>
|
||
<p>Base R formats control codes below 128 using octal escapes. There are
|
||
some other differences between the functions, which we will highlight
|
||
below.</p>
|
||
</div>
|
||
<div id="latin-1" class="section level3">
|
||
<h3>Latin-1</h3>
|
||
<p>ASCII works fine for most text in English, but not for other
|
||
languages. The Latin-1 encoding extends ASCII to Latin languages by
|
||
assigning the numbers 128 to 255 (hexadecimal 0x80 to 0xff) to other
|
||
common characters in Latin languages. We can see these characters
|
||
below.</p>
|
||
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a>codes <span class="ot"><-</span> <span class="fu">matrix</span>(<span class="dv">128</span><span class="sc">:</span><span class="dv">255</span>, <span class="dv">8</span>, <span class="dv">16</span>, <span class="at">byrow =</span> <span class="cn">TRUE</span>,</span>
|
||
<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a> <span class="at">dimnames =</span> <span class="fu">list</span>(<span class="fu">c</span>(<span class="dv">8</span><span class="sc">:</span><span class="dv">9</span>, letters[<span class="dv">1</span><span class="sc">:</span><span class="dv">6</span>]), <span class="fu">c</span>(<span class="dv">0</span><span class="sc">:</span><span class="dv">9</span>, letters[<span class="dv">1</span><span class="sc">:</span><span class="dv">6</span>])))</span>
|
||
<span id="cb16-3"><a href="#cb16-3" tabindex="-1"></a>latin1 <span class="ot"><-</span> <span class="fu">apply</span>(codes, <span class="fu">c</span>(<span class="dv">1</span>, <span class="dv">2</span>), intToUtf8)</span>
|
||
<span id="cb16-4"><a href="#cb16-4" tabindex="-1"></a></span>
|
||
<span id="cb16-5"><a href="#cb16-5" tabindex="-1"></a><span class="co"># replace control codes with ""</span></span>
|
||
<span id="cb16-6"><a href="#cb16-6" tabindex="-1"></a>latin1[<span class="fu">c</span>(<span class="st">"8"</span>, <span class="st">"9"</span>),] <span class="ot"><-</span> <span class="st">""</span></span>
|
||
<span id="cb16-7"><a href="#cb16-7" tabindex="-1"></a></span>
|
||
<span id="cb16-8"><a href="#cb16-8" tabindex="-1"></a><span class="fu">utf8_print</span>(latin1, <span class="at">quote =</span> <span class="cn">FALSE</span>)</span></code></pre></div>
|
||
<pre><code> 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
||
8
|
||
9
|
||
a ¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ ® ¯
|
||
b ° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿
|
||
c À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï
|
||
d Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß
|
||
e à á â ã ä å æ ç è é ê ë ì í î ï
|
||
f ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ</code></pre>
|
||
<p>As with ASCII, the first 32 numbers are control codes. The others are
|
||
characters common in Latin languages. Note that <code>0xa3</code>, the
|
||
invalid byte from <em>Mansfield Park</em>, corresponds to a pound sign
|
||
in the Latin-1 encoding. Given the context of the byte:</p>
|
||
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" tabindex="-1"></a>lines[<span class="dv">15252</span>]</span></code></pre></div>
|
||
<pre><code>[1] "the command of her beauty, and her \xa320,000, any one who could satisfy the"</code></pre>
|
||
<p>this is probably the right symbol. The text is probably encoded in
|
||
Latin-1, not UTF-8 or ASCII as claimed in the file.</p>
|
||
<p>If you run into an error while reading text that claims to be ASCII,
|
||
it is probably encoded as Latin-1. Note, however, that this is not the
|
||
only possibility, and there are many other encodings. The
|
||
<code>iconvlist</code> function will list the ones that R knows how to
|
||
process:</p>
|
||
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a><span class="fu">head</span>(<span class="fu">iconvlist</span>(), <span class="at">n =</span> <span class="dv">20</span>)</span></code></pre></div>
|
||
<pre><code> [1] "437" "850" "852" "855"
|
||
[5] "857" "860" "861" "862"
|
||
[9] "863" "865" "866" "869"
|
||
[13] "ANSI_X3.4-1968" "ANSI_X3.4-1986" "ARABIC" "ARMSCII-8"
|
||
[17] "ASCII" "ASMO-708" "ATARI" "ATARIST" </code></pre>
|
||
</div>
|
||
<div id="utf-8-1" class="section level3">
|
||
<h3>UTF-8</h3>
|
||
<p>With only 256 unique values, a single byte is not enough to encode
|
||
every character. Multi-byte encodings allow for encoding more. UTF-8
|
||
encodes characters using between 1 and 4 bytes each and allows for up to
|
||
1,112,064 character codes. Most of these codes are currently unassigned,
|
||
but every year the Unicode consortium meets and adds new characters. You
|
||
can find a list of all of the characters in the <a href="http://www.unicode.org/Public/10.0.0/ucd/UnicodeData.txt">Unicode
|
||
Character Database</a>. A listing of the Emoji characters is <a href="http://www.unicode.org/Public/emoji/5.0/emoji-data.txt">available
|
||
separately</a>.</p>
|
||
<p>Say you want to input the Unicode character with hexadecimal code
|
||
0x2603. You can do so in one of three ways:</p>
|
||
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" tabindex="-1"></a><span class="st">"\u2603"</span> <span class="co"># with \u + 4 hex digits</span></span></code></pre></div>
|
||
<pre><code>[1] "☃"</code></pre>
|
||
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" tabindex="-1"></a><span class="st">"\U00002603"</span> <span class="co"># with \U + 8 hex digits</span></span></code></pre></div>
|
||
<pre><code>[1] "☃"</code></pre>
|
||
<div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" tabindex="-1"></a><span class="fu">intToUtf8</span>(<span class="dv">0x2603</span>) <span class="co"># from an integer</span></span></code></pre></div>
|
||
<pre><code>[1] "☃"</code></pre>
|
||
<p>For characters above <code>0xffff</code>, the first method won’t
|
||
work. On Windows, a bug in the current version of R (fixed in R-devel)
|
||
prevents using the second method.</p>
|
||
<p>When you try to print Unicode in R, the system will first try to
|
||
determine whether the code is printable or not. Non-printable codes
|
||
include control codes and unassigned codes. On Mac OS, R uses an
|
||
outdated function to make this determination, so it is unable to print
|
||
most emoji. The <code>utf8_print</code> function uses the most recent
|
||
version (10.0.0) of the Unicode standard, and will print all Unicode
|
||
characters supported by your system:</p>
|
||
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" tabindex="-1"></a><span class="fu">print</span>(<span class="fu">intToUtf8</span>(<span class="dv">0x1f600</span> <span class="sc">+</span> <span class="dv">0</span><span class="sc">:</span><span class="dv">79</span>)) <span class="co"># base R</span></span></code></pre></div>
|
||
<pre><code>[1] "\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604\U0001f605\U0001f606\U0001f607\U0001f608\U0001f609\U0001f60a\U0001f60b\U0001f60c\U0001f60d\U0001f60e\U0001f60f\U0001f610\U0001f611\U0001f612\U0001f613\U0001f614\U0001f615\U0001f616\U0001f617\U0001f618\U0001f619\U0001f61a\U0001f61b\U0001f61c\U0001f61d\U0001f61e\U0001f61f\U0001f620\U0001f621\U0001f622\U0001f623\U0001f624\U0001f625\U0001f626\U0001f627\U0001f628\U0001f629\U0001f62a\U0001f62b\U0001f62c\U0001f62d\U0001f62e\U0001f62f\U0001f630\U0001f631\U0001f632\U0001f633\U0001f634\U0001f635\U0001f636\U0001f637\U0001f638\U0001f639\U0001f63a\U0001f63b\U0001f63c\U0001f63d\U0001f63e\U0001f63f\U0001f640\U0001f641\U0001f642\U0001f643\U0001f644\U0001f645\U0001f646\U0001f647\U0001f648\U0001f649\U0001f64a\U0001f64b\U0001f64c\U0001f64d\U0001f64e\U0001f64f"</code></pre>
|
||
<div class="sourceCode" id="cb30"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" tabindex="-1"></a><span class="fu">utf8_print</span>(<span class="fu">intToUtf8</span>(<span class="dv">0x1f600</span> <span class="sc">+</span> <span class="dv">0</span><span class="sc">:</span><span class="dv">79</span>)) <span class="co"># truncates to line width</span></span></code></pre></div>
|
||
<pre><code>[1] "😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣…"</code></pre>
|
||
<div class="sourceCode" id="cb32"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb32-1"><a href="#cb32-1" tabindex="-1"></a><span class="fu">utf8_print</span>(<span class="fu">intToUtf8</span>(<span class="dv">0x1f600</span> <span class="sc">+</span> <span class="dv">0</span><span class="sc">:</span><span class="dv">79</span>), <span class="at">chars =</span> <span class="dv">500</span>) <span class="co"># increase character limit</span></span></code></pre></div>
|
||
<pre><code>[1] "😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏"</code></pre>
|
||
<p>(Characters with codes above 0xffff, including most emoji, are not
|
||
supported on Windows.)</p>
|
||
<p>The <em>utf8</em> package provides the following utilities for
|
||
validating, formatting, and printing UTF-8 characters:</p>
|
||
<ul>
|
||
<li><p><code>as_utf8()</code> attempts to convert character data to
|
||
UTF-8, throwing an error if the data is invalid;</p></li>
|
||
<li><p><code>utf8_valid()</code> tests whether character data is valid
|
||
according to its declared encoding;</p></li>
|
||
<li><p><code>utf8_normalize()</code> converts text to Unicode composed
|
||
normal form (NFC), optionally applying case-folding and compatibility
|
||
maps;</p></li>
|
||
<li><p><code>utf8_encode()</code> encodes a character string, escaping
|
||
all control characters, so that it can be safely printed to the
|
||
screen;</p></li>
|
||
<li><p><code>utf8_format()</code> formats a character vector by
|
||
truncating to a specified character width limit or by left, right, or
|
||
center justifying;</p></li>
|
||
<li><p><code>utf8_print()</code> prints UTF-8 character data to the
|
||
screen;</p></li>
|
||
<li><p><code>utf8_width()</code> measures the display width of UTF-8
|
||
character strings (many emoji and East Asian characters are twice as
|
||
wide as other characters).</p></li>
|
||
</ul>
|
||
<p>The package does not provide a method to translate from another
|
||
encoding to UTF-8 as the <code>iconv()</code> function from base R
|
||
already serves this purpose.</p>
|
||
</div>
|
||
</div>
|
||
<div id="translating-to-utf-8" class="section level2">
|
||
<h2>Translating to UTF-8</h2>
|
||
<p>Back to our original problem: getting the text of <em>Mansfield
|
||
Park</em> into R. Our first attempt failed:</p>
|
||
<div class="sourceCode" id="cb34"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb34-1"><a href="#cb34-1" tabindex="-1"></a>corpus<span class="sc">::</span><span class="fu">term_stats</span>(lines)</span></code></pre></div>
|
||
<pre><code>Error in corpus::term_stats(lines): argument entry 15252 is incorrectly marked as "UTF-8": invalid leading byte (0xA3) at position 36</code></pre>
|
||
<p>We discovered a problem on line 15252:</p>
|
||
<div class="sourceCode" id="cb36"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" tabindex="-1"></a>lines[<span class="dv">15252</span>]</span></code></pre></div>
|
||
<pre><code>[1] "the command of her beauty, and her \xa320,000, any one who could satisfy the"</code></pre>
|
||
<p>The text is likely encoded in Latin-1, not UTF-8 (or ASCII) as we had
|
||
originally thought. We can test this by attempting to convert from
|
||
Latin-1 to UTF-8 with the <code>iconv()</code> function and inspecting
|
||
the output:</p>
|
||
<div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" tabindex="-1"></a>lines2 <span class="ot"><-</span> <span class="fu">iconv</span>(lines, <span class="st">"latin1"</span>, <span class="st">"UTF-8"</span>)</span>
|
||
<span id="cb38-2"><a href="#cb38-2" tabindex="-1"></a>lines2[<span class="dv">15252</span>]</span></code></pre></div>
|
||
<pre><code>[1] "the command of her beauty, and her £20,000, any one who could satisfy the"</code></pre>
|
||
<p>It worked! Now we can analyze our text.</p>
|
||
<div class="sourceCode" id="cb40"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1" tabindex="-1"></a>f <span class="ot"><-</span> corpus<span class="sc">::</span><span class="fu">text_filter</span>(<span class="at">drop_punct =</span> <span class="cn">TRUE</span>, <span class="at">drop =</span> corpus<span class="sc">::</span>stopwords_en)</span>
|
||
<span id="cb40-2"><a href="#cb40-2" tabindex="-1"></a>corpus<span class="sc">::</span><span class="fu">term_stats</span>(lines2, f)</span></code></pre></div>
|
||
<pre><code> term count support
|
||
1 fanny 816 806
|
||
2 must 508 492
|
||
3 crawford 493 488
|
||
4 mr 482 466
|
||
5 much 459 450
|
||
6 miss 432 419
|
||
7 said 406 400
|
||
8 mrs 408 399
|
||
9 sir 372 366
|
||
10 edmund 364 364
|
||
11 one 370 358
|
||
12 think 349 346
|
||
13 now 333 331
|
||
14 might 324 320
|
||
15 time 310 307
|
||
16 little 309 300
|
||
17 nothing 301 291
|
||
18 well 299 286
|
||
19 thomas 288 285
|
||
20 good 280 275
|
||
⋮ (8450 rows total)</code></pre>
|
||
</div>
|
||
<div id="the-readtext-package" class="section level2">
|
||
<h2>The <em>readtext</em> package</h2>
|
||
<p>If you need more than reading in a single text file, the <a href="https://github.com/quanteda/readtext">readtext</a> package
|
||
supports reading in text in a variety of file formats and encodings.
|
||
Beyond just plain text, that package can read in PDFs, Word documents,
|
||
RTF, and many other formats. (Unfortunately, that package currently
|
||
fails when trying to read in <em>Mansfield Park</em>; the authors are
|
||
aware of the issue and are working on a fix.)</p>
|
||
</div>
|
||
<div id="summary" class="section level2">
|
||
<h2>Summary</h2>
|
||
<p>Text comes in a variety of encodings, and you cannot analyze a text
|
||
without first knowing its encoding. Many functions for reading in text
|
||
assume that it is encoded in UTF-8, but this assumption sometimes fails
|
||
to hold. If you get an error message reporting that your UTF-8 text is
|
||
invalid, use <code>utf8_valid</code> to find the offending texts. Try
|
||
printing the data to the console before and after using
|
||
<code>iconv</code> to convert between character encodings. You can use
|
||
<code>utf8_print</code> to print UTF-8 characters that R refuses to
|
||
display, including emoji characters. For reading in exotic file formats
|
||
like PDF or Word, try the <a href="https://github.com/quanteda/readtext">readtext</a> package.</p>
|
||
</div>
|
||
|
||
|
||
|
||
<!-- code folding -->
|
||
|
||
|
||
<!-- dynamically load mathjax for compatibility with self-contained -->
|
||
<script>
// Create a <script> element for the hosted MathJax build and append it to
// <head>, so the library is fetched at runtime rather than inlined here.
(function () {
  var script = document.createElement("script");
  script.type = "text/javascript";
  script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
  document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
|
||
|
||
</body>
|
||
</html>
|