【发布时间】:2020-07-09 08:24:03
【问题描述】:
我有一个关于 R 中数据框操作的问题:对每一行,需要提取取值为 1 的列的列名,将它们以逗号分隔写入输出列,并统计这些列名的数量。
我有一个输入文件,其中 A 列(即 Genes 列)包含基因,其余各列以文献 ID 作为列名(输入文件示例如下所示)。我想要的是:对每个基因,把取值为 1(value = 1)的所有文献 ID 收集到 Output 列中(以逗号分隔),并在 Counts 列中统计这些 ID 的数量(输出文件示例如下所示)。在此之后,我将使用 merge 函数把这个输出数据框与我感兴趣的基因列表合并。请帮我解决这个问题。
# Load the three CSV tables. check.names = FALSE keeps the column headers
# exactly as written in the files (the literature IDs are purely numeric and
# would otherwise be rewritten into syntactic R names).
Input_data <- read.csv(
  "./Input.csv",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
Output_data <- read.csv(
  "./Output.csv",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
Genes <- read.csv(
  "./Genes.csv",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Join the per-gene output with the gene list on the shared "Genes" column.
# merge() defaults to all = FALSE, so only genes present in both tables remain.
Merge_data <- merge(x = Output_data, y = Genes, by = "Genes")
Input_data
dput(Input_data)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D",
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K",
"Gene_L", "Gene_M"), `20706538` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 0L), `14557386` = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), `22999554` = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `21906313` = c(1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L), `25229268` = c(1L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `22633082` = c(0L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `19228761` = c(1L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), `19543402` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `26955776` = c(1L,
1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `21126355` = c(1L,
1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA,
-13L))
Output_data
dput(Output_data)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D",
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K",
"Gene_L", "Gene_M"), Output = c("21906313, 25229268, 19228761, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355",
"20706538, 21906313, 22633082, 19228761, 26955776, 21126355",
"", "20706538, 21906313, 25229268, 22633082, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 26955776, 21126355",
"", "", "", "", "21906313, 21126355"), Counts = c(5L, 7L, 7L,
6L, 0L, 6L, 7L, 6L, 0L, 0L, 0L, 0L, 2L)), class = "data.frame", row.names = c(NA,
-13L))
Genes
dput(Genes)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D",
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K",
"Gene_L", "Gene_M", "Gene_N", "Gene_O", "Gene_P", "Gene_Q", "Gene_R",
"Gene_S", "Gene_T", "Gene_U", "Gene_V", "Gene_W")), class = "data.frame", row.names = c(NA,
-23L))
【问题讨论】:
标签: r dataframe merge dplyr tidyr