【发布时间】:2019-12-11 02:42:59
【问题描述】:
我有两个已排序的数据表(A 和 B),A 表的前 3 列是基因组坐标。我想从 A 表中提取一些特定的行,筛选条件取决于 A、B 两表共有的列及其取值。
例如:首先按 'ID' 列匹配 A 表和 B 表。如果 B 表 'strand' 列的值为 1,则在 A 表中找到 'Value' 列与 B 表 'Value' 列取值相同的第一行(第一次匹配),并选出该行以及坐标在它之前的所有行。如果 B 表 'strand' 列的值为 -1,则在 A 表中找到 'Value' 列与 B 表 'Value' 列取值相同的最后一行(最后一次匹配),并选出该行以及坐标在它之后的所有行。
# Example input table A (28 rows, 6 columns). Per the question text, the first
# three columns (chr, start, end) are genomic coordinates. Rows are grouped by
# gene ID in runs of 7, 7, 5 and 9 rows.
mydf1 <- structure(
  list(
    chr = rep(c("17", "8", "X", "10"), times = c(7L, 7L, 5L, 9L)),
    start = c(
      # ENSG00000005882
      50094737L, 50096132L, 50097423L, 50105371L, 50109957L, 50109957L,
      50109957L,
      # ENSG00000008853
      22987417L, 22999579L, 23004425L, 23005372L, 23006728L, 23015638L,
      23017252L,
      # ENSG00000018610
      119539386L, 119541328L, 119544353L, 119560268L, 119565232L,
      # ENSG00000065809
      14518560L, 14521158L, 14521869L, 14530332L, 14553322L, 14571763L,
      14572229L, 14667634L, 14774253L
    ),
    end = c(
      # ENSG00000005882
      50094898L, 50096286L, 50097564L, 50105442L, 50111058L, 50111368L,
      50112152L,
      # ENSG00000008853
      22987563L, 23000105L, 23004626L, 23005475L, 23007746L, 23015743L,
      23020199L,
      # ENSG00000018610
      119539789L, 119541420L, 119544491L, 119560385L, 119565401L,
      # ENSG00000065809
      14521306L, 14521306L, 14522019L, 14530515L, 14553387L, 14572189L,
      14572314L, 14667691L, 14774897L
    ),
    Value = c(
      1L, 2L, 3L, 3L, 1L, 2L, 1L,
      2L, 1L, 2L, 2L, 2L, 2L, 1L,
      1L, 2L, 2L, 2L, 1L,
      1L, 1L, 8L, 8L, 2L, 2L, 1L, 1L, 1L
    ),
    # Comma-separated transcript IDs; consecutive duplicates are written
    # with rep() instead of being spelled out.
    Tx = c(
      # ENSG00000005882
      rep("ENST00000007708", 2L),
      rep("ENST00000007708,ENST00000503176,ENST00000614357", 2L),
      "ENST00000007708",
      "ENST00000614357",
      "ENST00000503176",
      # ENSG00000008853
      "ENST00000519685",
      "ENST00000251822",
      rep("ENST00000251822,ENST00000519685", 4L),
      "ENST00000251822",
      # ENSG00000018610
      "ENST00000644802",
      rep("ENST00000320339,ENST00000644802", 3L),
      "ENST00000644802",
      # ENSG00000065809
      "ENST00000181796",
      "ENST00000479731",
      rep(
        "ENST00000181796,ENST00000378458,ENST00000378467,ENST00000378470,ENST00000468747,ENST00000478076,ENST00000479731,ENST00000622567",
        2L
      ),
      "ENST00000378458,ENST00000378467",
      "ENST00000378458,ENST00000622567",
      "ENST00000479731",
      rep("ENST00000181796", 2L)
    ),
    ID = rep(
      c(
        "ENSG00000005882", "ENSG00000008853",
        "ENSG00000018610", "ENSG00000065809"
      ),
      times = c(7L, 7L, 5L, 9L)
    )
  ),
  class = "data.frame",
  row.names = c(NA, 28L)
)

# Example input table B: one row per gene ID, with the strand (1 or -1) and
# the Value to look up in table A.
mydf2 <- structure(
  list(
    ID = c(
      "ENSG00000005882", "ENSG00000008853",
      "ENSG00000018610", "ENSG00000065809"
    ),
    name = c("PDK2", "RHOBTB2", "CXorf56", "FAM107B"),
    strand = c(1L, 1L, -1L, -1L),
    Value = c(3L, 2L, 2L, 8L)
  ),
  class = "data.frame",
  row.names = c(NA, 4L)
)
输入 A:
mydf1
chr start end Value Tx ID
17 50094737 50094898 1 ENST00000007708 ENSG00000005882
17 50096132 50096286 2 ENST00000007708 ENSG00000005882
17 50097423 50097564 3 ENST00000007708,ENST00000503176,ENST00000614357 ENSG00000005882
17 50105371 50105442 3 ENST00000007708,ENST00000503176,ENST00000614357 ENSG00000005882
17 50109957 50111058 1 ENST00000007708 ENSG00000005882
17 50109957 50111368 2 ENST00000614357 ENSG00000005882
17 50109957 50112152 1 ENST00000503176 ENSG00000005882
8 22987417 22987563 2 ENST00000519685 ENSG00000008853
8 22999579 23000105 1 ENST00000251822 ENSG00000008853
8 23004425 23004626 2 ENST00000251822,ENST00000519685 ENSG00000008853
8 23005372 23005475 2 ENST00000251822,ENST00000519685 ENSG00000008853
8 23006728 23007746 2 ENST00000251822,ENST00000519685 ENSG00000008853
8 23015638 23015743 2 ENST00000251822,ENST00000519685 ENSG00000008853
8 23017252 23020199 1 ENST00000251822 ENSG00000008853
X 119539386 119539789 1 ENST00000644802 ENSG00000018610
X 119541328 119541420 2 ENST00000320339,ENST00000644802 ENSG00000018610
X 119544353 119544491 2 ENST00000320339,ENST00000644802 ENSG00000018610
X 119560268 119560385 2 ENST00000320339,ENST00000644802 ENSG00000018610
X 119565232 119565401 1 ENST00000644802 ENSG00000018610
10 14518560 14521306 1 ENST00000181796 ENSG00000065809
10 14521158 14521306 1 ENST00000479731 ENSG00000065809
10 14521869 14522019 8 ENST00000181796,ENST00000378458,ENST00000378467,ENST00000378470,ENST00000468747,ENST00000478076,ENST00000479731,ENST00000622567 ENSG00000065809
10 14530332 14530515 8 ENST00000181796,ENST00000378458,ENST00000378467,ENST00000378470,ENST00000468747,ENST00000478076,ENST00000479731,ENST00000622567 ENSG00000065809
10 14553322 14553387 2 ENST00000378458,ENST00000378467 ENSG00000065809
10 14571763 14572189 2 ENST00000378458,ENST00000622567 ENSG00000065809
10 14572229 14572314 1 ENST00000479731 ENSG00000065809
10 14667634 14667691 1 ENST00000181796 ENSG00000065809
10 14774253 14774897 1 ENST00000181796 ENSG00000065809
输入 B:
mydf2
ID name strand Value
ENSG00000005882 PDK2 1 3
ENSG00000008853 RHOBTB2 1 2
ENSG00000018610 CXorf56 -1 2
ENSG00000065809 FAM107B -1 8
期望输出:
17 50094737 50094898 1 ENST00000007708 ENSG00000005882
17 50096132 50096286 2 ENST00000007708 ENSG00000005882
17 50097423 50097564 3 ENST00000007708,ENST00000503176,ENST00000614357 ENSG00000005882
8 22987417 22987563 2 ENST00000519685 ENSG00000008853
X 119560268 119560385 2 ENST00000320339,ENST00000644802 ENSG00000018610
X 119565232 119565401 1 ENST00000644802 ENSG00000018610
10 14530332 14530515 8 ENST00000181796,ENST00000378458,ENST00000378467,ENST00000378470,ENST00000468747,ENST00000478076,ENST00000479731,ENST00000622567 ENSG00000065809
10 14553322 14553387 2 ENST00000378458,ENST00000378467 ENSG00000065809
10 14571763 14572189 2 ENST00000378458,ENST00000622567 ENSG00000065809
10 14572229 14572314 1 ENST00000479731 ENSG00000065809
10 14667634 14667691 1 ENST00000181796 ENSG00000065809
10 14774253 14774897 1 ENST00000181796 ENSG00000065809
我尝试过使用 dplyr 和 ifelse,但没能得到想要的结果。
非常感谢任何可能的解决方案!
【问题讨论】:
-
嘿,你能为每个表提供一份 `dput()` 的输出吗?
-
你能用列名代替“第六列”、“第四列”等说法吗?看起来第一步是对表 A 和 B 做 `left_join`,这会让列的顺序难以跟踪;而且你使用的 dplyr 本来就鼓励只通过列名来引用列。
-
谢谢,我已经更新了列名。
-
你能用“之前/之后的坐标”具体说明你的意思吗?
-
idownvotedbecau.se/unclearquestion idownvotedbecau.se/toomuchcode idownvotedbecau.se/unreadablecode 请把问题说清楚。看起来很有趣,但真的不清楚你在问什么。
标签: r dataframe if-statement dplyr