这应该比目前显示的嵌套循环快得多。在我的机器上,没有任何并行化,它在大约 12.5 分钟内完成了 100,000 个字符串和 50,000 个单词/子字符串。
样本数据:
library("data.table")
# Downloaded from https://raw.githubusercontent.com/datasets/airport-codes/master/data/airport-codes.csv
airports <- fread("airport-codes.csv")
first_bit <- paste(c("Lives", "Works", "Plays", "Condo", "Apartment", "I love"), "in")
places <- unique(c(airports$name, airports[!municipality == "", municipality]))
set.seed(123)
strings <- data.table(
string = paste(sample(first_bit, 1e5, TRUE),
sample(places, 1e5, TRUE))
)
words <- sample(places, 5e4)
基于grepl的实际套路:
system.time({
strings[, `:=`(lower = tolower(string), result = NA_character_)]
words <- words[order(nchar(words), words, decreasing = TRUE)]
i <- 0
for (x in words) {
i <- i + 1
if (i %% 100 == 0) cat(i, "\n")
found <- grepl(tolower(x), strings$lower, fixed = TRUE)
strings[found & is.na(result), result := x]
}
strings[, lower := NULL]
})
注意在 Windows 上,fread 和它的同类在开箱即用的 https 链接上工作,但在 linux 上,您需要使用 download.file 和适当的 curl 或 wget 选项。
编辑 OP 现在表明他只想要整个单词匹配。这可以使用非固定匹配和正则表达式中的\b 语法来实现。然而,这也是一个更快地完成整个事情的机会。
这是一个建议的算法,在我的机器上运行不到一分钟。它在空间边界将每个字符串拆分为单词(首先将多个连续空格压缩为一个之后)。然后它计算由整个单词组成的每个可能子字符串的长度。然后按长度拆分搜索的关键字,match 可用于查找子字符串和关键字之间的精确匹配。因为关键字从大到小排序,所以它总是使用最长的可用关键字。
library("data.table")
library("stringr")
# Downloaded from https://raw.githubusercontent.com/datasets/airport-codes/master/data/airport-codes.csv
airports <- fread("airport-codes.csv")
first_bit <- paste(c("Lives", "Works", "Plays", "Condo", "Apartment", "I love"), "in")
places <- unique(c(airports$name, airports[!municipality == "", municipality]))
set.seed(123)
strings <- data.table(
string = paste(sample(first_bit, 1e5, TRUE),
sample(places, 1e5, TRUE))
)
words <- sample(places, 5e4)
system.time({
strings[, `:=`(lower = tolower(str_replace_all(string, "\\s+", " ")), result = NA_character_, str_no = .I)]
setkey(strings, str_no)
words_dt <- data.table(word = words[order(nchar(words), words, decreasing = TRUE)])
words_dt[, lower := tolower(str_replace_all(word, "\\s+", " "))]
words_dt[, nc := nchar(lower)]
gaps <- str_locate_all(strings$string, "\\S+")
starts <- unlist(lapply(gaps, function(x) x[, 1]))
starts_len <- unlist(lapply(lengths(gaps)/2, seq, 1))
dists <- lapply(seq(gaps), function(i) dist(c(gaps[[i]][, 1], nchar(strings$string[i]) + 2)) - 1)
bits_dt <- data.table(dist = unlist(dists), str_no = rep(strings$str_no, lengths(dists)), start = rep(starts, starts_len), key = "str_no")
setkey(strings, str_no)
for (len in unique(nchar(words))) {
cat(len, "\n")
words_right_length <- words_dt[nc == len]
bits_right_length <- bits_dt[.(strings[is.na(result), str_no])][dist == len]
bits_right_length[, matches := match(substr(strings[str_no, lower], start, start + dist - 1), words_right_length$lower)]
matched <- bits_right_length[, .(first_match = na.omit(matches)[1]), by = str_no][!is.na(first_match)]
if (nrow(matched) > 0) {
matched[, word := words_right_length[first_match, word]]
setkey(matched, str_no)
strings[matched, result := word]
}
}
strings[, `:=`(lower = NULL, str_no = NULL)]
})