由于'red'、'blue'、'yellow'列是factor,我们将其强制为numeric,并使用索引替换为Map内的相应列名
data[-1] <- Map(function(x, y) c('None', y)[as.numeric(x)],
data[-1], names(data)[-1])
names(data)[-1] <- paste0("c", 1:3)
data
# id c1 c2 c3
#1 1 red blue None
#2 2 None blue None
#3 3 red blue None
#4 4 None blue None
#5 5 red None None
#6 6 None blue None
#7 7 None blue None
#8 8 None blue yellow
#9 9 None None yellow
或通过更改 levels 来选择其他选项
data[-1] <- Map(function(x, y) {levels(x) <- c('None', y)
x},data[-1], names(data)[-1])
或者使用lapply,我们遍历列序列,提取列,将其更改为numeric,并使用索引将值更改为列名和“无”
data[-1] <- lapply(seq_along(data[-1]), function(i)
c("None", names(data)[-1][i])[as.numeric(data[-1][[i]])] )
注意:给出预期的输出。
或者使用向量化的方法,我们创建一个逻辑矩阵,与列索引相乘并将索引更改为列名
data[-1] <- `dim<-`(names(data)[-1][col(data[-1]) *
(NA ^(data[-1] == 0))], dim(data[-1]))
或者replace的另一个选项
data[-1] <- replace(as.matrix(data[-1]), data[-1]==1,
rep(names(data)[-1], colSums(data[-1] == 1)))
或使用tidyverse
library(tidyverse)
imap(data[-1], ~ c('none', .y)[as.numeric(.x)]) %>%
bind_cols(data[1], .) %>%
rename_at(2:4, ~ paste0("c", 1:3))
# id c1 c2 c3
#1 1 red blue none
#2 2 none blue none
#3 3 red blue none
#4 4 none blue none
#5 5 red none none
#6 6 none blue none
#7 7 none blue none
#8 8 none blue yellow
#9 9 none none yellow
或者gather/spread
data %>%
gather(key, val, -id) %>%
mutate(val = case_when(val == 1 ~ key),
key = factor(key, levels = unique(key), labels = paste0("c", 1:3))) %>%
spread(key, val)
基准测试
这里有一些基准
data1 <- data[rep(seq_len(nrow(data)), 1e5),]
system.time({
Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1])
})
# user system elapsed
# 0.065 0.014 0.078
system.time({
`dim<-`(names(data1)[-1][col(data1[-1]) *
(NA ^(data1[-1] == 0))], dim(data1[-1]))
})
# user system elapsed
# 0.387 0.036 0.422
system.time({
imap(data1[-1], ~ c('none', .y)[as.numeric(.x)])
})
# user system elapsed
# 0.047 0.006 0.054
system.time({
lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
}
)
# user system elapsed
# 0.555 0.067 0.621
system.time({
ifelse(data1[-1] == 1, names(data1[-1])[col(data1[-1])], NA)
})
# user system elapsed
# 0.711 0.060 0.770
在 1e6 数据集上
data1 <- data[rep(seq_len(nrow(data)), 1e6),]
system.time({Map( function(x, y) {levels(x) <- c('None', y)
x},data1[-1], names(data1)[-1])})
# user system elapsed
# 0.123 0.016 0.139
system.time({
Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1])
})
# user system elapsed
# 0.328 0.074 0.402
system.time({
lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
}
)
# user system elapsed
# 7.125 0.463 7.561
带有微基准
library(microbenchmark)
microbenchmark(ak = Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1]),
ak2 = Map( function(x, y) {levels(x) <- c('None', y); x},data1[-1], names(data1)[-1]),
rs = lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA)), unit = 'relative', times = 10L)
#Unit: relative
#expr min lq mean median uq max nev
#ak 6.14964 4.048205 2.401768 1.741373 2.47268 2.43698 10
#ak2 1.00000 1.000000 1.000000 1.000000 1.00000 1.00000 10
#rs 70.73601 45.468868 23.020272 20.408306 18.63263 16.01278 10
数据
data <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), red = structure(c(2L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
blue = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("0",
"1"), class = "factor"), yellow = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor")),
class = "data.frame", row.names = c(NA,
-9L))