如果您想避免使用正则表达式,可以尝试词干提取(stemming)。这种方法还能匹配一些不规则的复数形式。
library(dplyr)
library(tidytext)
library(SnowballC)
# Demo: a singular noun and its irregular plural reduce to the same stem.
wordStem(words = c("lady", "ladies"))
# [1] "ladi" "ladi"
# Example product names to classify; `id` numbers the rows 1..n.
product_list <- tibble(
  product = c(
    "banana from ecuador 1 unit",
    "argentinian meat (1 kg) cow",
    "chicken breast",
    "noodles",
    "salad",
    "chicken salad with egg",
    "chicken breasts",
    "eggs from chickens"
  ),
  id = seq_along(product)
)

# Product groups to match against; `pid` numbers the groups 1..n.
product_to_match <- tibble(
  product_group = c(
    "cow meat",
    "deer meat",
    "cow milk",
    "chicken breast",
    "chicken egg salad",
    "anana"
  ),
  pid = seq_along(product_group)
)
tidytext 包提供了将文档转换为标记/单词的框架。
# convert to tidy word lists: one row per word, with the product `id` kept
tidy_products <- unnest_tokens(product_list, output = word, input = product)
tidy_products
# # A tibble: 23 x 2
# id word
# <int> <chr>
# 1 1 banana
# 2 1 from
# 3 1 ecuador
# 4 1 1
# 5 1 unit
# 6 2 argentinian
# 7 2 meat
# 8 2 1
# 9 2 kg
# 10 2 cow
# # … with 13 more rows
SnowballC::wordStem 负责执行词干提取。
# Replace every word with its stem so that singular and plural forms of the
# same word compare equal (e.g. "breasts" becomes "breast" below).
tidy_products <- tidy_products %>%
  mutate(word = wordStem(word))
tail(tidy_products)
# # A tibble: 6 x 2
# id word
# <int> <chr>
# 1 6 egg
# 2 7 chicken
# 3 7 breast
# 4 8 egg
# 5 8 from
# 6 8 chicken
# same processing for the product groups: split into words, then stem them
tidy_match <- unnest_tokens(
  product_to_match,
  output = word,
  input = product_group
)
tidy_match <- mutate(tidy_match, word = wordStem(word))
至此,您可以检查整个单词是否完全相等,例如使用匹配运算符 %in%。采用这种方法时,只要产品组的所有单词都出现在产品名称中(顺序不限),就视为匹配。请注意,一个产品组的单词可能被另一个产品组完全包含,例如“牛肉”和“牛肉汉堡”;由于只取第一个匹配项,匹配数据框中各产品组的先后顺序很重要。
# Return the id of the first product group whose words all occur in `string`.
#
# string:  character vector of (stemmed) words belonging to one product.
# matchdf: data frame with columns `word` and `pid`, one row per word of
#          each product group.
#
# Groups are tried in the order `split()` yields them (by `pid` level), and
# word order inside `string` is irrelevant. The result is the `pid` of the
# first fully contained group, or NA when no group matches.
first_product_id <- function(string, matchdf) {
  groups <- split(matchdf, f = matchdf$pid)
  for (candidate in groups) {
    found <- candidate$word %in% string
    # A group without any words must not count as a match: all(logical(0))
    # would otherwise be TRUE.
    if (length(found) == 0) {
      found <- FALSE
    }
    if (all(found)) {
      return(candidate$pid[1])
    }
  }
  NA
}
# Example: look up the product group for the stemmed words of product 3.
first_product_id(
  string = with(tidy_products, word[id == 3]),
  matchdf = tidy_match
)
# [1] 4
# look up table: for each product `id`, the `pid` of the first product group
# whose words are all among that product's words (NA if there is none)
lut <- summarise(
  group_by(tidy_products, id),
  pid = first_product_id(string = word, matchdf = tidy_match)
)
# Attach the matched `pid` to every product, then the group name for that
# `pid`; products without a match keep NA in both columns.
left_join(
  left_join(product_list, lut, by = "id"),
  product_to_match,
  by = "pid"
)
# # A tibble: 8 x 4
# product id pid product_group
# <chr> <int> <int> <chr>
# 1 banana from ecuador 1 unit 1 NA NA
# 2 argentinian meat (1 kg) cow 2 1 cow meat
# 3 chicken breast 3 4 chicken breast
# 4 noodles 4 NA NA
# 5 salad 5 NA NA
# 6 chicken salad with egg 6 5 chicken egg salad
# 7 chicken breasts 7 4 chicken breast
# 8 eggs from chickens 8 NA NA