# Listing metadata: 154 lines, 5.1 KiB, R
## ----setup, include = FALSE---------------------------------------------------
# knitr chunk defaults for this document: collapse = TRUE, and "#>" as the
# comment string.
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

# Attach stringr; the chunks below call str_*() and regex() unqualified.
library(stringr)

## ----eval = FALSE-------------------------------------------------------------
# # The regular call:
# str_extract(fruit, "nana")
# # Is shorthand for:
# str_extract(fruit, regex("nana"))

## -----------------------------------------------------------------------------
# Sample strings; `x` is read again by a later chunk, so the name is kept.
x <- c("apple", "banana", "pear")

# Extract the literal substring "an" from each element.
str_extract(string = x, pattern = "an")

## -----------------------------------------------------------------------------
bananas <- c("banana", "Banana", "BANANA")

# Same pattern twice: first as a plain string, then wrapped in
# regex(ignore_case = TRUE).
case_blind <- regex("banana", ignore_case = TRUE)
str_detect(bananas, "banana")
str_detect(bananas, case_blind)

## -----------------------------------------------------------------------------
# "." on either side of a literal "a"; `x` comes from an earlier chunk.
str_extract(string = x, pattern = ".a.")

## -----------------------------------------------------------------------------
# Same input and pattern, without and then with dotall = TRUE.
newline_wrapped <- "\nX\n"
str_detect(newline_wrapped, ".X.")
str_detect(newline_wrapped, regex(".X.", dotall = TRUE))

## -----------------------------------------------------------------------------
# The R string literal needs a doubled backslash ...
dot <- "\\."

# ... but the resulting string holds only one, as writeLines() shows:
writeLines(text = dot)

# Used inside a pattern, the escaped dot looks for an explicit "."
candidates <- c("abc", "a.c", "bef")
str_extract(candidates, "a\\.c")

## -----------------------------------------------------------------------------
# The literal "a\\b" is the three characters a, backslash, b; writeLines()
# prints them as stored.
x <- "a\\b"
writeLines(text = x)

# Four backslashes in the R literal give a two-backslash regex, which matches
# one literal backslash.
backslash <- "\\\\"
str_extract(x, backslash)

## -----------------------------------------------------------------------------
x <- c("a.b.c.d", "aeb")
starts_with <- "a.b"

# Prefix pasted straight into the regex: its "." acts as a metacharacter.
as_regex <- paste0("^", starts_with)
str_detect(x, as_regex)

# Prefix wrapped in \Q...\E so that it is matched literally.
as_literal <- paste0("^\\Q", starts_with, "\\E")
str_detect(x, as_literal)

## -----------------------------------------------------------------------------
# "a" followed by U+0301 (a combining accent): two code points.
x <- "a\u0301"

# Compare what "." and "\X" each extract from the same input.
str_extract(x, pattern = ".")
str_extract(x, pattern = "\\X")

## -----------------------------------------------------------------------------
# All runs of digits in the expression; str_extract_all() returns a list with
# one element per input string, so take the first.
digit_runs <- str_extract_all("1 + 2 = 3", "\\d+")
digit_runs[[1]]

## -----------------------------------------------------------------------------
# Some Khmer numerals (U+17E1 to U+17E3); these digits are Khmer script,
# not Lao.
str_detect("១២៣", "\\d")

## -----------------------------------------------------------------------------
# Input mixing spaces, tabs, newlines and a form feed; print it first.
text <- "Some \t badly\n\t\tspaced \f text"
text

# Replace every run of whitespace with a single space.
str_replace_all(text, pattern = "\\s+", replacement = " ")

## -----------------------------------------------------------------------------
# Three quoting styles; print the vector first.
text <- c('"Double quotes"', "«Guillemet»", "“Fancy quotes”")
text

# Replace every character with the Unicode "quotation mark" property by an
# apostrophe.
str_replace_all(text, pattern = "\\p{quotation mark}", replacement = "'")

## -----------------------------------------------------------------------------
sentence <- "Don't eat that!"

# Runs of word characters, then the pieces left after splitting on each
# non-word character.
str_extract_all(sentence, "\\w+")[[1]]
str_split(sentence, "\\W")[[1]]

## -----------------------------------------------------------------------------
phrase <- "The quick brown fox"

# Mark every \b position, then every \B position, with an underscore.
str_replace_all(phrase, "\\b", "_")
str_replace_all(phrase, "\\B", "_")

## -----------------------------------------------------------------------------
# Alternation: the pattern offers "abc" or "def".
either_abc_or_def <- "abc|def"
str_detect(c("abc", "def", "ghi"), either_abc_or_def)

## -----------------------------------------------------------------------------
spellings <- c("grey", "gray")

# Ungrouped, the alternatives are "gre" and "ay"; the parentheses limit the
# choice to the single letter.
str_extract(spellings, "gre|ay")
str_extract(spellings, "gr(e|a)y")

## -----------------------------------------------------------------------------
# Two characters captured as group 1, followed by a backreference to them.
pattern <- "(..)\\1"

# Elements of `fruit` that match the pattern ...
repeated_pair <- str_subset(fruit, pattern)
repeated_pair

# ... and, for those elements, the full match plus the captured pair.
str_match(repeated_pair, pattern)

## -----------------------------------------------------------------------------
spellings <- c("grey", "gray")

# Same alternation as a capturing group (e|a), then as a (?:e|a) group.
str_match(spellings, "gr(e|a)y")
str_match(spellings, "gr(?:e|a)y")

## -----------------------------------------------------------------------------
x <- c("apple", "banana", "pear")

# "a" anchored with ^, then "a" anchored with $.
str_extract(x, pattern = "^a")
str_extract(x, pattern = "a$")

## -----------------------------------------------------------------------------
x <- "Line 1\nLine 2\nLine 3\n"

# "^" as a plain pattern, "^" with multiline = TRUE, then "\A" with
# multiline = TRUE, all applied to the same three-line string.
per_line <- regex("^Line..", multiline = TRUE)
input_start <- regex("\\ALine..", multiline = TRUE)

str_extract_all(x, "^Line..")[[1]]
str_extract_all(x, per_line)[[1]]
str_extract_all(x, input_start)[[1]]

## -----------------------------------------------------------------------------
# `x` is read again by the following chunks, so the name is kept.
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"

# The ? and + quantifiers on "C", then + on the class [LX].
str_extract(x, pattern = "CC?")
str_extract(x, pattern = "CC+")
str_extract(x, pattern = "C[LX]+")

## -----------------------------------------------------------------------------
# Counted repetition of "C" ({2}, {2,}, {2,3}); `x` comes from an earlier chunk.
str_extract(x, pattern = "C{2}")
str_extract(x, pattern = "C{2,}")
str_extract(x, pattern = "C{2,3}")

## -----------------------------------------------------------------------------
# Each vector pairs a quantifier with the same quantifier followed by "?";
# both patterns are applied to `x` from an earlier chunk.
counted_pair <- c("C{2,3}", "C{2,3}?")
class_pair <- c("C[LX]+", "C[LX]+?")

str_extract(x, counted_pair)
str_extract(x, class_pair)

## -----------------------------------------------------------------------------
# The same alternation inside an atomic group (?>...) and inside a
# non-capturing group (?:...).
atomic <- "(?>A|.B)C"
non_capturing <- "(?:A|.B)C"

str_detect("ABC", atomic)
str_detect("ABC", non_capturing)

## -----------------------------------------------------------------------------
# Look-ahead: digits that are followed by " piece" or " pieces".
x <- c("1 piece", "2 pieces", "3")
before_pieces <- "\\d+(?= pieces?)"
str_extract(x, before_pieces)

# Look-behind: digits that are preceded by a literal "$".
y <- c("100", "$400")
after_dollar <- "(?<=\\$)\\d+"
str_extract(y, after_dollar)

## -----------------------------------------------------------------------------
# A pattern carrying an inline (?#...) comment after the literal "x".
str_detect(string = "xyz", pattern = "x(?#this is a comment)")

## -----------------------------------------------------------------------------
# Pattern for a phone number written as 3-3-4 digits, built with
# comments = TRUE so it can be laid out over several commented lines.
# Three capture groups: area code, next three digits, last four digits.
# The last group must be \d{4}: with \d{3} the match for the sample numbers
# below stops one digit short ("514-791-814").
phone <- regex("
  \\(?       # optional opening parens
  (\\d{3})   # area code
  \\)?       # optional closing parens
  (?:-|\\ )? # optional dash or space
  (\\d{3})   # another three numbers
  (?:-|\\ )? # optional dash or space
  (\\d{4})   # final four numbers
  ", comments = TRUE)

# One row per input: the full match followed by the three captured groups.
str_match(c("514-791-8141", "(514) 791 8141"), phone)
