将包含相同变量的多列折叠为一列答案

【问题标题】：Collapsing multiple columns containing the same variable into one column（将包含相同变量的多列折叠为一列）
【发布时间】：2015-04-04 14:04:28
【问题描述】：

我的数据如下所示：

ID   Diagnosis_1   Diagnosis_2   Diagnosis_3   Diagnosis_4
A        1             0             0             0
A        1             0             0             0
A        1             0             0             0
B        0             1             0             0
C        0             0             0             1
C        0             1             0             0
D        0             0             0             1
E        0             0             1             0
E        0             1             0             0
E        0             0             1             0

Diagnosis_1 到 Diagnosis_4 都是二元变量，表示某项诊断存在（1）或不存在（0）。我想做的是创建一个如下所示的数据框：

ID   Diagnosis
A        1
A        1
A        1
B        2
C        4
C        2
D        4
E        3
E        2
E        3

无论我读了多少次关于 reshape/reshape2/tidyr 的文档，我都无法理解它们的实现。

我可以使用 dplyr 的 mutate 来解决我的问题，但这是实现我的目标的一种耗时且迂回的方式。

编辑：编辑数据以更真实地代表我的实际数据框。

【问题讨论】：

标签： r reshape

【解决方案1】：

尝试矩阵乘法：

# Multiply the 0/1 indicator columns (everything except the first column, ID)
# by the weights 1..k, where k is the number of indicator columns. Assuming
# each row contains exactly one 1, the product is the position of that 1.
# seq_len() is used instead of seq() so that k == 0 gives an empty weight
# vector rather than the sequence c(1, 0).
nc <- ncol(DF)
data.frame(ID = DF$ID, Diagnosis = as.matrix(DF[-1]) %*% seq_len(nc - 1L))

结果如下：

  ID Diagnosis
1  A         1
2  B         2
3  C         2
4  D         4
5  E         3

注意：我们将此用作输入：

# Whitespace-delimited sample data: an ID column followed by four 0/1
# diagnosis indicator columns, with the column names on the first line.
Lines <- "ID   Diagnosis_1   Diagnosis_2   Diagnosis_3   Diagnosis_4
A        1             0             0             0
B        0             1             0             0
C        0             1             0             0
D        0             0             0             1
E        0             0             1             0"

# Parse the text block into a data frame, reading column names from its header.
DF <- read.table(header = TRUE, text = Lines)

【讨论】：

【解决方案2】：

您可以尝试max.col 获取每行的列索引。

 # max.col() gives, for each row, the position of the largest value among the
 # non-ID columns; for 0/1 indicators that is the column holding the 1.
 # ties.method = "first" keeps the result deterministic when a row has several
 # equal maxima (the default breaks ties at random).
 data.frame(ID = df1$ID, Diagnosis = max.col(df1[-1], ties.method = "first"))
 #    ID Diagnosis
 #1  A         1
 #2  B         2
 #3  C         2
 #4  D         4
 #5  E         3

或者获取索引的另一个选项是

 # After transposing, the indicator columns become rows, so the row index of
 # every non-zero cell is the number of the flagged indicator column.
 unname(which(t(df1[-1]) != 0, arr.ind = TRUE)[, 1L])
 #[1] 1 2 2 4 3

基准测试

 # Fix the RNG seed before drawing the indicator matrix.
 set.seed(25)
 # Draw 1e8 * 5 values from {0, 1} (probability 0.9 for 0, 0.1 for 1) and
 # arrange them as a 5-column matrix.
 m1 <- matrix(sample(0:1, 1e8*5, replace=TRUE, prob=c(0.9, 0.1)), ncol=5)
 # Keep only the rows whose values sum to 1, i.e. rows with exactly one 1.
 m2 <- m1[rowSums(m1)==1,]
 dim(m2)
 #[1] 32812201        5
 # Re-seed before drawing the random ID letters.
 set.seed(395)
 # Pair every retained row with an ID sampled (with replacement) from LETTERS;
 # IDs are kept as character strings rather than factors.
 df1 <- data.frame(ID= sample(LETTERS, nrow(m2), replace=TRUE), m2,
        stringsAsFactors=FALSE) 
 # Rename the indicator columns to X_1 ... X_5.
 colnames(df1)[-1] <- paste('X', 1:5, sep="_")

 # Matrix-multiplication approach used in the benchmark.
 #
 # Reads the global data frame `df1`, multiplies its non-ID columns (as a
 # matrix) by the weights 1..k, where k is the number of non-ID columns, and
 # returns a data frame with columns `ID` and `Diagnosis`.
 #
 # seq_len() replaces seq() so that k == 0 yields an empty weight vector
 # instead of the sequence c(1, 0).
 Grothendieck <- function() {
   nc <- ncol(df1)
   data.frame(ID = df1$ID,
              Diagnosis = as.matrix(df1[-1]) %*% seq_len(nc - 1L))
 }

# max.col() approach used in the benchmark.
#
# Reads the global data frame `df1` and returns a data frame with columns
# `ID` and `Diagnosis`, where `Diagnosis` is the position of the first maximal
# value among the non-ID columns of each row.
akrun <- function() {
  first_max <- max.col(df1[-1], ties.method = "first")
  data.frame(ID = df1$ID, Diagnosis = first_max)
}
# tidyr/dplyr approach used in the benchmark.
#
# Reads the global data frame `df1`, reshapes every column except `ID` into
# long form, splits each former column name into the pieces `var` and `value`,
# keeps the rows whose cell value is 1, and returns the `ID` and `value`
# columns.
ananda <- function() {
  long <- gather(df1, var, val, -ID)
  parts <- separate(long, var, into = c("var", "value"))
  flagged <- filter(parts, val == 1)
  select(flagged, ID, value)
}

 # Time one evaluation of each base-R approach; both functions read the
 # global `df1`. The commented lines are the timings recorded by the author.
 system.time(akrun())
 #  user  system elapsed 
 # 3.690   0.396   4.085 
 system.time(Grothendieck())
 #  user  system elapsed 
 # 3.121   0.459   3.581

在“df1”的较小子集（即前 1e6 行）上尝试了 dplyr 解决方案：

# Time the tidyr/dplyr approach on at most the first 1e6 rows of `df1`.
# head() returns fewer rows when `df1` is shorter, whereas indexing with
# 1:1e6 would pad the result with all-NA rows.
df1 <- head(df1, 1e6)
system.time(ananda())
#   user  system elapsed 
#  6.279   0.177   6.454

使用microbenchmark，

library(microbenchmark)
# Run each base-R approach 20 times and report timings relative to the
# fastest expression.
microbenchmark(
  akrun(),
  Grothendieck(),
  times = 20L,
  unit = "relative"
)
#Unit: relative
#         expr      min       lq     mean   median      uq       max neval cld
 #      akrun() 1.019108 1.252443 1.084306 1.180743 1.16463 0.6928535    20  a
#Grothendieck() 1.000000 1.000000 1.000000 1.000000 1.00000 1.0000000    20  a

数据

 # Five-row example data: a character ID column plus four integer 0/1
 # diagnosis indicator columns.
 df1 <- data.frame(
   ID = c("A", "B", "C", "D", "E"),
   Diagnosis_1 = c(1L, 0L, 0L, 0L, 0L),
   Diagnosis_2 = c(0L, 1L, 1L, 0L, 0L),
   Diagnosis_3 = c(0L, 0L, 0L, 0L, 1L),
   Diagnosis_4 = c(0L, 0L, 0L, 1L, 0L),
   stringsAsFactors = FALSE
 )

【讨论】：

【解决方案3】：

由于您提到“reshape2”、“tidyr”和相关工具，以下是一些可供考虑的选项：

## Using "tidyr" and "dplyr"
library(dplyr)
library(tidyr)

# Reshape every column except ID into long form, split each former column
# name into the pieces `var` and `value`, and keep ID/value for the cells
# equal to 1. local() keeps the intermediate objects out of the workspace.
local({
  long <- gather(df1, var, val, -ID)
  parts <- separate(long, var, into = c("var", "value"))
  select(filter(parts, val == 1), ID, value)
})
#   ID value
# 1  A     1
# 2  B     2
# 3  C     2
# 4  E     3
# 5  D     4

## Getting half-way there with "melt" from "reshape2"
library(reshape2)
# Turn the zeros into NA first so that na.rm = TRUE drops them while melting,
# leaving only the cells equal to 1.
melt(
  data = replace(df1, df1 == 0, NA),
  id.vars = "ID",
  na.rm = TRUE
)
#    ID    variable value
# 1   A Diagnosis_1     1
# 7   B Diagnosis_2     1
# 8   C Diagnosis_2     1
# 15  E Diagnosis_3     1
# 19  D Diagnosis_4     1

考虑到您的更新，您只需要添加一个辅助 ID：

library(dplyr)
library(tidyr)

# Number the rows within each ID (ID2) so repeated IDs stay distinguishable,
# reshape the Diagnosis_1..Diagnosis_4 columns into long form, split each
# former column name into `var` and `value`, keep the cells equal to 1, and
# order the result by ID and within-ID row number.
mydf %>%
  group_by(ID) %>%
  mutate(ID2 = row_number()) %>%
  # The grouping is only needed to build ID2; drop it so the remaining steps
  # and any later code work on an ungrouped table.
  ungroup() %>%
  gather(var, val, Diagnosis_1:Diagnosis_4) %>%
  separate(var, into = c("var", "value")) %>%
  filter(val == 1) %>%
  arrange(ID, ID2)
# Source: local data frame [10 x 5]
# 
#    ID ID2       var value val
# 1   A   1 Diagnosis     1   1
# 2   A   2 Diagnosis     1   1
# 3   A   3 Diagnosis     1   1
# 4   B   1 Diagnosis     2   1
# 5   C   1 Diagnosis     4   1
# 6   C   2 Diagnosis     2   1
# 7   D   1 Diagnosis     4   1
# 8   E   1 Diagnosis     3   1
# 9   E   2 Diagnosis     2   1
# 10  E   3 Diagnosis     3   1

【讨论】：

'var' 和 'val' 指的是什么？
@MakairaMurakami，这些是您在收集列后将创建的两列。
感谢您的解释。不幸的是，当我运行 tidyr/dplyr 命令集时出现以下错误：“错误：值未分成两部分”，然后继续列出一长串数字。
@MakairaMurakami，您的列/不同名称的列是否比您共享的示例中的列多？
是的，我有 5 个诊断列，分别命名为“Diagnosis___1”到“Diagnosis__4”，然后是“Diagnosis__10”以及 ID 列。鉴于您提供的代码，为什么这会有所不同？