Remove stringi dependency #936 (#986)

hadley · web-flow · commit f0bcfd929d48 · 2020-07-21T12:26:07.000-05:00
Fixes #936
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -33,7 +33,6 @@ Imports:
     purrr,
     Rcpp,
     rlang,
-    stringi,
     tibble (>= 2.1.1),
     tidyselect (>= 1.1.0),
     utils,
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # tidyr (development version)
 
+* stringi dependency has been removed; this was a substantial dependency that
+  made tidyr hard to compile in resource-constrained environments
+  (@rjpat, #936).
+
 # tidyr 1.1.0
 
 ## General features
diff --git a/R/extract.R b/R/extract.R
@@ -55,18 +55,14 @@ str_extract <- function(x, into, regex, convert = FALSE) {
     is_character(into)
   )
 
-  matches <- stringi::stri_match_first_regex(x, regex)[, -1, drop = FALSE]
-
-  if (ncol(matches) != length(into)) {
+  out <- str_match_first(x, regex)
+  if (length(out) != length(into)) {
     stop(
-      "`regex` should define ", length(into), " groups; ", ncol(matches), " found.",
+      "`regex` should define ", length(into), " groups; ", length(out), " found.",
       call. = FALSE
     )
   }
 
-  out <- as_tibble(matches, .name_repair = "minimal")
-  out <- as.list(out)
-
   # Handle duplicated names
   if (anyDuplicated(into)) {
     pieces <- split(out, into)
@@ -88,3 +84,27 @@ str_extract <- function(x, into, regex, convert = FALSE) {
 
   out
 }
+
+# Helpers -----------------------------------------------------------------
+
+str_match_first <- function(string, regex) {
+  loc <- regexpr(regex, string, perl = TRUE)
+  loc <- group_loc(loc)
+
+  out <- lapply(
+    seq_len(loc$matches),
+    function(i) substr(string, loc$start[, i], loc$end[, i])
+  )
+  out[-1]
+}
+
+group_loc <- function(x) {
+  start <- cbind(as.vector(x), attr(x, "capture.start"))
+  end <- start + cbind(attr(x, "match.length"), attr(x, "capture.length")) - 1L
+
+  no_match <- start == -1L
+  start[no_match] <- NA
+  end[no_match] <- NA
+
+  list(matches = ncol(start), start = start, end = end)
+}
diff --git a/R/pivot-long.R b/R/pivot-long.R
@@ -281,7 +281,7 @@ build_longer_spec <- function(data, cols,
   if (is.null(names_prefix)) {
     names <- names(cols)
   } else {
-    names <- stringi::stri_replace_all_regex(names(cols), paste0("^", names_prefix), "")
+    names <- gsub(paste0("^", names_prefix), "", names(cols))
   }
 
   if (length(names_to) > 1) {
diff --git a/R/separate-rows.R b/R/separate-rows.R
@@ -31,7 +31,7 @@ separate_rows.data.frame <- function(data,
                                      convert = FALSE) {
   vars <- tidyselect::eval_select(expr(c(...)), data)
 
-  out <- purrr::modify_at(data, vars, stringi::stri_split_regex, pattern = sep)
+  out <- purrr::modify_at(data, vars, strsplit, split = sep, perl = TRUE)
   out <- unchop(as_tibble(out), any_of(vars))
   if (convert) {
     out[vars] <- map(out[vars], type.convert, as.is = TRUE)
diff --git a/R/separate.R b/R/separate.R
@@ -108,17 +108,18 @@ str_separate <- function(x, into, sep, convert = FALSE, extra = "warn", fill = "
 }
 
 strsep <- function(x, sep) {
-  nchar <- stringi::stri_length(x)
+  nchar <- nchar(x)
   pos <- map(sep, function(i) {
     if (i >= 0) return(i)
     pmax(0, nchar + i)
   })
   pos <- c(list(0), pos, list(nchar))
 
   map(1:(length(pos) - 1), function(i) {
-    stringi::stri_sub(x, pos[[i]] + 1, pos[[i + 1]])
+    substr(x, pos[[i]] + 1, pos[[i + 1]])
   })
 }
+
 str_split_fixed <- function(value, sep, n, extra = "warn", fill = "warn") {
   if (extra == "error") {
     warn(glue(
@@ -132,7 +133,7 @@ str_split_fixed <- function(value, sep, n, extra = "warn", fill = "warn") {
   fill <- arg_match(fill, c("warn", "left", "right"))
 
   n_max <- if (extra == "merge") n else -1L
-  pieces <- stringi::stri_split_regex(value, sep, n_max)
+  pieces <- str_split_n(value, sep, n_max = n_max)
 
   simp <- simplifyPieces(pieces, n, fill == "left")
 
@@ -151,6 +152,24 @@ str_split_fixed <- function(value, sep, n, extra = "warn", fill = "warn") {
   simp$strings
 }
 
+str_split_n <- function(x, pattern, n_max = -1) {
+  m <- gregexpr(pattern, x, perl = TRUE)
+  if (n_max > 0) {
+    m <- lapply(m, function(x) slice_match(x, seq_along(x) < n_max))
+  }
+  regmatches(x, m, invert = TRUE)
+}
+
+slice_match <- function(x, i) {
+  structure(
+    x[i],
+    match.length = attr(x, "match.length")[i],
+    index.type = attr(x, "index.type"),
+    useBytes = attr(x, "useBytes")
+  )
+}
+
+
 list_indices <- function(x, max = 20) {
   if (length(x) > max) {
     x <- c(x[seq_len(max)], "...")
diff --git a/data-raw/population.R b/data-raw/population.R
@@ -11,7 +11,7 @@ pop <- read_csv("data-raw/TB_burden_countries_2014-11-07.csv",
 population <- pop %>%
   select(country, year, population = e_pop_num) %>%
   filter(year >= 1995) %>%
-  mutate(country = stringi::stri_trans_general(country, "latin-ascii"))
+  mutate(country = iconv(country, from = "UTF-8", to = "ASCII//TRANSLIT"))
 
 write_csv(population, "data-raw/population.csv")
 save(population, file = "data/population.rdata")
diff --git a/data-raw/who.R b/data-raw/who.R
@@ -10,7 +10,7 @@ who <- who_raw %>%
     new_sn_m014:new_sn_m65, new_sn_f014:new_sn_f65, new_ep_m014:new_ep_m65,
     new_ep_f014:new_ep_f65, newrel_m014:newrel_m65, newrel_f014:newrel_f65
   ) %>%
-  mutate(country = stringi::stri_trans_general(country, "latin-ascii"))
+  mutate(country = iconv(country, from = "UTF-8", to = "ASCII//TRANSLIT"))
 
 write_csv(who, "data-raw/who.csv")
 save(who, file = "data/who.rdata")
diff --git a/tests/testthat/test-extract.R b/tests/testthat/test-extract.R
@@ -52,3 +52,26 @@ test_that("informative error message if wrong number of groups", {
   expect_error(extract(df, x, "y", "."), "should define 1 groups")
   expect_error(extract(df, x, c("y", "z"), "."), "should define 2 groups")
 })
+
+test_that("str_match_first handles edge cases", {
+  expect_identical(
+    str_match_first(c("r-2", "d-2-3-4"), "(.)-(.)"),
+    list(c("r", "d"), c("2", "2"))
+  )
+  expect_identical(
+    str_match_first(NA, "test"),
+    list()
+  )
+  expect_equivalent(
+    str_match_first(c("", " "), "^(.*)$"),
+    list(c("", " "))
+  )
+  expect_equivalent(
+    str_match_first("", "(.)-(.)"),
+    list(NA_character_, NA_character_)
+  )
+  expect_equivalent(
+    str_match_first(character(), "(.)-(.)"),
+    list(character(), character())
+  )
+})
diff --git a/tests/testthat/test-separate.R b/tests/testthat/test-separate.R
@@ -119,6 +119,19 @@ test_that("checks type of `into` and `sep`", {
   )
 })
 
+# helpers -----------------------------------------------------------------
+
+test_that("str_split_n can cap number of splits", {
+  expect_equal(str_split_n(c("x,x"), ",", 1), list("x,x"))
+  expect_equal(str_split_n(c("x,x"), ",", 2), list(c("x", "x")))
+  expect_equal(str_split_n(c("x,x"), ",", 3), list(c("x", "x")))
+})
+
+test_that("str_split_n handles edge cases", {
+  expect_equal(str_split_n(character(), ",", 1), list())
+  expect_equal(str_split_n(NA, ",", 1), list(NA_character_))
+})
+
 test_that("list_indices truncates long warnings", {
   expect_equal(list_indices(letters, max = 3), "a, b, c, ...")
 })

Original file line number	Diff line number	Diff line change
`@@ -281,7 +281,7 @@ build_longer_spec <- function(data, cols,`
`281`	`281`	`if (is.null(names_prefix)) {`
`282`	`282`	`names <- names(cols)`
`283`	`283`	`} else {`
`284`		`- names <- stringi::stri_replace_all_regex(names(cols), paste0("^", names_prefix), "")`
	`284`	`+ names <- gsub(paste0("^", names_prefix), "", names(cols))`
`285`	`285`	`}`
`286`	`286`
`287`	`287`	`if (length(names_to) > 1) {`