将带有虚拟变量的数据框转换为分类变量答案

【问题标题】：Convert data frame with dummy variables into categorical variables将带有虚拟变量的数据框转换为分类变量
【发布时间】：2019-02-08 02:26:20
【问题描述】：

我需要将虚拟变量转换为分类变量。作为 R 的新手，我只知道如何反其道而行之。有人能指出我正确的方向吗？

数据框是：

data <- data.frame(id=c(1,2,3,4,5,6,7,8,9), 
               red=c("1","0","1","0","1","0","0","0","0"),
               blue=c("1","1","1","1","0","1","1","1","0"),
               yellow=c("0","0","0","0","0","0","0","1","1"))

预期的输出是：

【问题讨论】：

标签： r categorical-data dummy-variable

【解决方案1】：

由于'red'、'blue'、'yellow'列是factor，我们将其强制为numeric，并使用索引替换为Map内的相应列名

data[-1] <- Map(function(x, y) c('None', y)[as.numeric(x)], 
                  data[-1], names(data)[-1])
names(data)[-1] <- paste0("c", 1:3)
data
#  id  c1   c2     c3
#1  1  red blue   None
#2  2 None blue   None
#3  3  red blue   None
#4  4 None blue   None
#5  5  red None   None
#6  6 None blue   None
#7  7 None blue   None
#8  8 None blue yellow
#9  9 None None yellow

或通过更改 levels 来选择其他选项

data[-1] <-  Map(function(x, y) {levels(x) <- c('None', y)
                    x},data[-1], names(data)[-1])

或者使用lapply，我们遍历列序列，提取列，将其更改为numeric，并使用索引将值更改为列名和“无”

data[-1] <- lapply(seq_along(data[-1]), function(i) 
         c("None", names(data)[-1][i])[as.numeric(data[-1][[i]])] )

注意：给出预期的输出。

或者使用向量化的方法，我们创建一个逻辑矩阵，与列索引相乘并将索引更改为列名

data[-1] <-  `dim<-`(names(data)[-1][col(data[-1]) * 
                       (NA ^(data[-1] == 0))], dim(data[-1]))

或者replace的另一个选项

data[-1] <- replace(as.matrix(data[-1]), data[-1]==1, 
               rep(names(data)[-1], colSums(data[-1] == 1)))

或使用tidyverse

library(tidyverse)
imap(data[-1], ~ c('none', .y)[as.numeric(.x)]) %>%
         bind_cols(data[1], .) %>% 
         rename_at(2:4, ~ paste0("c", 1:3))
#  id   c1   c2     c3
#1  1  red blue   none
#2  2 none blue   none
#3  3  red blue   none
#4  4 none blue   none
#5  5  red none   none
#6  6 none blue   none
#7  7 none blue   none
#8  8 none blue yellow
#9  9 none none yellow

或者gather/spread

data %>% 
    gather(key, val, -id) %>% 
    mutate(val = case_when(val == 1 ~ key), 
    key = factor(key, levels = unique(key), labels = paste0("c", 1:3))) %>% 
    spread(key, val)

基准测试

这里有一些基准

data1 <- data[rep(seq_len(nrow(data)), 1e5),]
system.time({
  Map(function(x, y) c('None', y)[as.numeric(x)], 
                   data1[-1], names(data1)[-1])

 })
#   user  system elapsed 
#  0.065   0.014   0.078 


system.time({
 `dim<-`(names(data1)[-1][col(data1[-1]) * 
                       (NA ^(data1[-1] == 0))], dim(data1[-1]))

})
# user  system elapsed 
#  0.387   0.036   0.422 

system.time({
   imap(data1[-1], ~ c('none', .y)[as.numeric(.x)]) 
})
# user  system elapsed 
# 0.047   0.006   0.054 


 system.time({
   lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
  }
  )
 # user  system elapsed 
 #  0.555   0.067   0.621 


system.time({
    ifelse(data1[-1] == 1, names(data1[-1])[col(data1[-1])], NA)

  })
# user  system elapsed 
#  0.711   0.060   0.770

在 1e6 数据集上

data1 <- data[rep(seq_len(nrow(data)), 1e6),] 
system.time({Map( function(x, y) {levels(x) <- c('None', y)
              x},data1[-1], names(data1)[-1])})
#   user  system elapsed 
#    0.123   0.016   0.139 

system.time({
   Map(function(x, y) c('None', y)[as.numeric(x)], 
                   data1[-1], names(data1)[-1])

  })
#   user  system elapsed 
#  0.328   0.074   0.402 
 system.time({
    lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
   }
   )
#   user  system elapsed 
#  7.125   0.463   7.561

带有微基准

library(microbenchmark)
microbenchmark(ak = Map(function(x, y) c('None', y)[as.numeric(x)], 
                 data1[-1], names(data1)[-1]), 
         ak2 = Map( function(x, y) {levels(x) <- c('None', y); x},data1[-1], names(data1)[-1]),
          rs = lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA)), unit = 'relative', times = 10L)
#Unit: relative
#expr      min        lq      mean    median       uq      max nev 
#ak  6.14964  4.048205  2.401768  1.741373  2.47268  2.43698    10
#ak2  1.00000  1.000000  1.000000  1.000000  1.00000  1.00000    10
#rs 70.73601 45.468868 23.020272 20.408306 18.63263 16.01278    10

数据

data <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), red = structure(c(2L, 
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), 
blue = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("0", 
"1"), class = "factor"), yellow = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor")), 
 class = "data.frame", row.names = c(NA, 
-9L))

【讨论】：

【解决方案2】：

lapply 的一个选项通过忽略第一列 (id)，我们检查哪些列的值为 1 并将其替换为相应的列名，其他的可以更改为 NA。

data[-1] <- lapply(names(data[-1]), function(x) ifelse(data[x] == 1, x, NA))

data
#  id  red blue yellow
#1  1  red blue   <NA>
#2  2 <NA> blue   <NA>
#3  3  red blue   <NA>
#4  4 <NA> blue   <NA>
#5  5  red <NA>   <NA>
#6  6 <NA> blue   <NA>
#7  7 <NA> blue   <NA>
#8  8 <NA> blue yellow
#9  9 <NA> <NA> yellow

另一种不使用lapply循环的方法

data[-1] <- ifelse(data[-1] == 1, names(data[-1])[col(data[-1])], NA)


data
#  id  red blue yellow
#1  1  red blue   <NA>
#2  2 <NA> blue   <NA>
#3  3  red blue   <NA>
#4  4 <NA> blue   <NA>
#5  5  red <NA>   <NA>
#6  6 <NA> blue   <NA>
#7  7 <NA> blue   <NA>
#8  8 <NA> blue yellow
#9  9 <NA> <NA> yellow

【讨论】：

【解决方案3】：

这可能吗？

library(tidyverse)

data <- data %>%
  mutate(red = case_when(
    red == 1 ~ "Red", red == 0 ~ "None")) %>% 
  mutate(blue = case_when(
    blue == 1 ~ "Blue", blue == 0 ~ "None")) %>% 
  mutate(yellow = case_when(
    yellow == 1 ~ "Yellow", yellow == 0 ~ "None"
))

data

【讨论】：

使用 tidyverse 的好替代方法。也许ifelse 会更简洁？ case_when 如果您最终得到大量嵌套的 ifelse 语句，则效果很好。