98 lines
3.6 KiB
R
98 lines
3.6 KiB
R
|
{
|
||
|
# Build a single string from a vector of Unicode code points.
#
# x: numeric or integer code points; coerced with as.integer().
# Returns a length-one character string.
#
# intToUtf8() covers the whole code point range.  Formatting each value as
# a "\uXXXX" escape and parsing the result only works up to U+FFFF: the
# parser reads at most four hex digits after \u, so U+1F600 would turn into
# U+1F60 followed by a literal "0".
codepointsToString <- function(x)
    intToUtf8(as.integer(x))
|
||
|
|
||
|
# Check that charClass() classifies a set of code points the same way
# whether they are supplied as integers or as the equivalent string, and
# optionally that the classification matches a known answer.
#
# codepoints: numeric vector of positive code points (coerced to integer).
# class:      character class name handed on to charClass().
# expected:   optional logical vector, one element per code point.
#
# Returns TRUE when every check passes; otherwise a named list holding the
# all.equal() outcome of each check that did not pass.
testCharClass <- function(codepoints, class, expected = NULL) {
    stopifnot(is.numeric(codepoints))
    codepoints <- as.integer(codepoints)
    stopifnot(!anyNA(codepoints), all(codepoints > 0))

    haveExpected <- !is.null(expected)
    if (haveExpected) {
        stopifnot(length(codepoints) == length(expected),
                  is.logical(expected))
    }

    fromInteger <- charClass(codepoints, class)
    fromString <- charClass(codepointsToString(codepoints), class)

    checks <- list(
        `charClass(int vs char)` = all.equal(fromInteger, fromString))
    if (haveExpected)
        checks[["expected"]] <- all.equal(expected, fromInteger)

    # keep only the checks whose outcome is something other than TRUE
    failures <- checks[!vapply(checks, isTRUE, logical(1))]
    if (length(failures) > 0) failures else TRUE
}
|
||
|
|
||
|
# character class names that the checks in this file hand to charClass()
charClasses <- c(
    "alnum", "alpha", "blank", "cntrl",
    "digit", "graph", "lower", "print",
    "punct", "space", "upper", "xdigit"
)
|
||
|
# sample inputs, stored as vectors of Unicode code points
testCodepoints <- list(
    # "\tAB, ab:3", all ASCII
    ASCII = c(
        0x0009,  # tab
        0x0041,  # A
        0x0042,  # B
        0x002c,  # ,
        0x0020,  # space
        0x0061,  # a
        0x0062,  # b
        0x003a,  # :
        0x0033   # 3
    ),

    # "Ivan IV", with Ivan in Cyrillic
    Cyrillic = c(
        0x0418, 0x0432, 0x0430, 0x043d,  # "Ivan" in Cyrillic letters
        0x0020,                          # space
        0x0049, 0x0056                   # I, V
    ),

    # "Shalom", letters are U+05d0 through U+05ea
    # the others (at 2, 3 and 6) are diacritical marks
    Hebrew = c(
        0x05E9,  # letter
        0x05C1,  # diacritical mark
        0x05B8,  # diacritical mark
        0x05DC,  # letter
        0x05D5,  # letter
        0x05B9,  # diacritical mark
        0x05DD   # letter
    )
)
|
||
|
|
||
|
# check for consistency between integer and string inputs:
# every sample in testCodepoints is run through testCharClass() once for
# every class in charClasses
local({
    # all (sample, class) pairs; the sample name varies fastest
    combos <- expand.grid(sample = names(testCodepoints),
                          class = charClasses,
                          stringsAsFactors = FALSE)
    outcomes <- Map(function(s, cl) testCharClass(testCodepoints[[s]], cl),
                    combos$sample, combos$class)

    # testCharClass() returns TRUE on success and a list of all.equal()
    # messages otherwise, so test each outcome with isTRUE() rather than
    # feeding the flattened results to all(), which fails with an unrelated
    # type error as soon as a message is present
    failed <- !vapply(outcomes, isTRUE, logical(1))
    if (any(failed))
        stop("charClass() gives different results for integer and string ",
             "input for: ",
             paste0(combos$sample[failed], "/", combos$class[failed],
                    collapse = ", "),
             call. = FALSE)
})
|
||
|
}
|
||
|
|
||
|
# spot check return values for the ASCII sample
local({
    # the sample "\tAB, ab:3" spelled out one character per code point, so
    # each expectation can be stated in terms of the characters themselves
    chars <- c("\t", "A", "B", ",", " ", "a", "b", ":", "3")
    ascii <- testCodepoints[["ASCII"]]

    # blank: the tab and the space
    stopifnot(all.equal(chars %in% c("\t", " "),
                        charClass(ascii, "blank")))

    # punct: the comma and the colon
    stopifnot(all.equal(chars %in% c(",", ":"),
                        charClass(ascii, "punct")))

    # digit: only the trailing "3"
    stopifnot(all.equal(chars == "3",
                        charClass(ascii, "digit")))

    # alnum: the four letters and the digit
    stopifnot(all.equal(chars %in% c("A", "B", "a", "b", "3"),
                        charClass(ascii, "alnum")))
})
|
||
|
|
||
|
# In principle, this can be locale dependent.
# Ubuntu in C locale (without internal iswxxxxx) gives different results.
# The non-ASCII spot checks are therefore skipped when LC_CTYPE is "C".
if (Sys.getlocale("LC_CTYPE") != "C") {
    local({
        # "Ivan IV": four Cyrillic letters (the first one a capital),
        # a space, then the Latin capitals "I" and "V"
        cyrillic <- testCodepoints[["Cyrillic"]]
        isSpace   <- c(FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE)
        isCapital <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE)

        # everything except the space is a letter
        stopifnot(all.equal(!isSpace, charClass(cyrillic, "alpha")))
        stopifnot(all.equal(isCapital, charClass(cyrillic, "upper")))
        # the remaining letters are lower case
        stopifnot(all.equal(!isSpace & !isCapital,
                            charClass(cyrillic, "lower")))
        stopifnot(all.equal(isSpace, charClass(cyrillic, "space")))

        hebrew <- testCodepoints[["Hebrew"]]
        # positions of the diacritical marks within the Hebrew sample
        marks <- c(2, 3, 6)

        # Ubuntu & Windows 10 disagree about diacriticals,
        # so only the letters are required to be alphabetic
        stopifnot(all(charClass(hebrew, "alpha")[-marks]))

        # no cases in Hebrew alphabet: nothing is lower or upper case
        stopifnot(all(!charClass(hebrew, "lower")))
        stopifnot(all(!charClass(hebrew, "upper")))
    })
}
|