From 62b659f1d2cacb87375acdeffd415b4ecda9ff16 Mon Sep 17 00:00:00 2001 From: dustinstoltz Date: Fri, 17 Apr 2026 09:14:17 -0400 Subject: [PATCH] Use Any-Latin; Latin-ASCII in replace_non_ascii() to handle non-Latin scripts Fixes #64 Previously, replace_non_ascii() used stri_trans_general(x, 'latin-ascii'), which only transliterated Latin-script characters. Non-Latin scripts (Cyrillic, CJK, Devanagari, etc.) were either left as byte sequences or stripped entirely by remove.nonconverted. Now uses 'Any-Latin; Latin-ASCII' to first transliterate any script to Latin, then Latin to ASCII. This is backwards compatible since Any-Latin is a no-op for already-Latin input. --- R/replace_non_ascii.R | 2 +- tests/testthat/test-replace_non_ascii.R | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100755 tests/testthat/test-replace_non_ascii.R diff --git a/R/replace_non_ascii.R b/R/replace_non_ascii.R index 86878c6..3683a95 100644 --- a/R/replace_non_ascii.R +++ b/R/replace_non_ascii.R @@ -35,7 +35,7 @@ replace_non_ascii <- function (x, replacement = '', remove.nonconverted = TRUE, ...) { x <- replace_curly_quote(x) - x <- stringi::stri_trans_general(x, "latin-ascii") + x <- stringi::stri_trans_general(x, "Any-Latin; Latin-ASCII") x <- iconv(as.character(x), "", "ASCII", "byte") Encoding(x) <- "latin1" x <- mgsub(x, ser, reps) diff --git a/tests/testthat/test-replace_non_ascii.R b/tests/testthat/test-replace_non_ascii.R new file mode 100755 index 0000000..443ee0b --- /dev/null +++ b/tests/testthat/test-replace_non_ascii.R @@ -0,0 +1,23 @@ +context("Checking replace_non_ascii") + +test_that("replace_non_ascii transliterates Latin and non-Latin scripts to ASCII", { + x <- c("heiß", "brûlée", "Дорога", "キャンパス", "भोजन") + Encoding(x) <- "UTF-8" + expect_equal(replace_non_ascii(x), c("heiss", "brulee", "Doroga", "kyanpasu", "bhojana")) +}) + +test_that("replace_non_ascii with remove.nonconverted = FALSE preserves unmapped characters", { + x <- "hello" + expect_equal(replace_non_ascii(x, remove.nonconverted = FALSE), "hello") +}) + +test_that("replace_non_ascii2 replaces non-ASCII with regex", { + x <- "hello world" + expect_equal(replace_non_ascii2(x), "hello world") +}) + +test_that("replace_curly_quote replaces curly quotes", { + z <- '\x93Hello\x94' + Encoding(z) <- "latin1" + expect_equal(replace_curly_quote(z), '"Hello"') +}) \ No newline at end of file