【发布时间】:2019-01-18 16:21:15
【问题描述】:
我正在尝试使用 R 编程语言将特定字符串模式转换为三个不同列的二进制列。
这是我所拥有的:
have <- structure(list(rep1 = c("china", "na", "bay", "eng", "giad",
"china", "sing", "giad", "na", "china", "china, camp", "guat,camp",
"na", "na", "cis", "trans", "stron, mon"), rep2 = c("china",
"na", "bay", "eng", "giad", "china", "sing", "giad", "na", "china",
"china, camp", "camp", "na", "na", "cis", "trans", "stron, mon"
), rep3 = c("na", "na", "bay", "eng", "giad", "china", "sing",
"giad", "china", "china", "china, camp", "camp", "na", "na",
"cis", "trans", "stron, mon")), row.names = c(NA, -17L), class = c("data.table",
"data.frame"))
这就是我想要的:
want <- structure(list(rep1 = c("china", "na", "bay", "eng", "giad",
"china", "sing", "giad", "na", "china", "china, camp", "guat,camp",
"na", "na", "cis", "trans", "stron, mon"), rep2 = c("china",
"na", "bay", "eng", "giad", "china", "sing", "giad", "na", "china",
"china, camp", "camp", "na", "na", "cis", "trans", "stron, mon"
), rep3 = c("na", "na", "bay", "eng", "giad", "china", "sing",
"giad", "china", "china", "china, camp", "camp", "na", "na",
"cis", "trans", "stron, mon"), rep1_chi = c(1, 0, 0, 0, 0, 1,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0), rep2_chi = c(1, 0, 0, 0, 0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0), rep3_chi = c(0, 0, 0, 0,
0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0), rep1_bay = c(0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rep2_bay = c(0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rep3_bay = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rep1_gia = c(0,
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), rep2_gia = c(0,
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), rep3_gia = c(0,
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), rep1_sin = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rep2_sin = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rep3_sin = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-17L))
我能够使用ifelse 和stringr::str_detect 创建一个可行的解决方案,如下所示:
want <- have %>% dplyr::select(rep1, rep2, rep3) %>% mutate(
rep1_chi = ifelse(str_detect(rep1,"chi") == T,1,0),
rep2_chi = ifelse(str_detect(rep2,"chi") == T,1,0),
rep3_chi = ifelse(str_detect(rep3,"chi") == T,1,0),
rep1_bay = ifelse(str_detect(rep1,"bay") == T,1,0),
rep2_bay = ifelse(str_detect(rep2,"bay") == T,1,0),
rep3_bay = ifelse(str_detect(rep3,"bay") == T,1,0),
rep1_gia = ifelse(str_detect(rep1,"gia") == T,1,0),
rep2_gia = ifelse(str_detect(rep2,"gia") == T,1,0),
rep3_gia = ifelse(str_detect(rep3,"gia") == T,1,0),
rep1_sin = ifelse(str_detect(rep1,"sin") == T,1,0),
rep2_sin = ifelse(str_detect(rep2,"sin") == T,1,0),
rep3_sin = ifelse(str_detect(rep3,"sin") == T,1,0))
我最大的问题是它似乎相当重复。 我想知道是否有更优雅的解决方案?考虑到“rep”列的数字顺序为 1-3,我认为可能有更好的编程方法。
通过 SO,我发现使用 model.matrix 的 following solution 似乎在您想要每个模式并且只对单个列感兴趣时工作得很好。我试着把它变成一个函数,这样我就可以选择多个列——但我仍然必须删除不感兴趣的模式的字符串。
【问题讨论】: