eipi10 提供了一个很好的答案。除此之外,我想将主要尝试的内容留在 data.table 中。首先,我用cSplit() 拆分两列(即SerialNum and Name),用add_rownames() 添加一个索引,然后按索引拆分数据。在第一个lapply() 中,我使用了splitstackshape 包中的Stacked()。我堆叠了 SerialNum 和 Name;如您在temp2 的一部分中所见,分隔的 SeriaNum 和 Name 成为两列。在第二个lapply() 中,我使用了data.table 包中的合并。然后,我删除了带有 NA 的行(lapply(na.omit)),合并了所有数据表(rbindlist),并通过rowname(原始数据的行号)和Name(setorder(rowname, Name))更改了行的顺序)
library(data.table)
library(splitstackshape)
library(dplyr)
cSplit(mydf, c("SerialNum", "Name"), direction = "wide",
type.convert = FALSE, sep = "\n") %>%
add_rownames %>%
split(f = .$rowname) -> temp
#a part of temp
#$`1`
#Source: local data frame [1 x 12]
#
#rowname Year SerialNum_1 SerialNum_2 SerialNum_3 SerialNum_4 SerialNum_5 Name_1 Name_2
#(chr) (dbl) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
#1 1 2015 983 837 424 NA NA Michael Lewis
#Variables not shown: Name_3 (chr), Name_4 (chr), Name_5 (chr)
lapply(temp, function(x){
Stacked(x, var.stubs = c("SerialNum", "Name"), sep = "_")
}) -> temp2
# A part of temp2
#$`1`
#$`1`$SerialNum
# rowname Year .time_1 SerialNum
#1: 1 2015 1 983
#2: 1 2015 2 837
#3: 1 2015 3 424
#4: 1 2015 4 NA
#5: 1 2015 5 NA
#
#$`1`$Name
# rowname Year .time_1 Name
#1: 1 2015 1 Michael
#2: 1 2015 2 Lewis
#3: 1 2015 3 Paul
#4: 1 2015 4 NA
#5: 1 2015 5 NA
lapply(1:nrow(mydf), function(x){
merge(temp2[[x]]$SerialNum, temp2[[x]]$Name, by = c("rowname", "Year", ".time_1"))
}) %>%
lapply(na.omit) %>%
rbindlist %>%
setorder(rowname, Name) -> out
print(out)
# rowname Year .time_1 SerialNum Name
# 1: 1 2015 2 837 Lewis
# 2: 1 2015 1 983 Michael
# 3: 1 2015 3 424 Paul
# 4: 2 2014 1 123 Elaine
# 5: 2 2014 3 789 George
# 6: 2 2014 2 456 Jerry
# 7: 2 2014 4 136 Kramer
# 8: 3 2010 3 321 George
# 9: 3 2010 1 987 John
#10: 3 2010 2 654 Paul
#11: 3 2010 4 975 Ringo
#12: 4 2015 3 837 Lewis
#13: 4 2015 2 983 Michael
#14: 4 2015 1 424 Paul
#15: 5 2014 3 123 Elaine
#16: 5 2014 2 789 George
#17: 5 2014 1 456 Jerry
#18: 5 2014 4 136 Kramer
数据
mydf <- structure(list(SerialNum = c("983\n837\n424\n ", "123\n456\n789\n136",
"987\n654\n321\n975\n ", "424\n983\n837", "456\n789\n123\n136"
), Year = c(2015, 2014, 2010, 2015, 2014), Name = c("Michael\nLewis\nPaul\n ",
"Elaine\nJerry\nGeorge\nKramer", "John\nPaul\nGeorge\nRingo\nNA",
"Paul\nMichael\nLewis", "Jerry\nGeorge\nElaine\nKramer")), .Names = c("SerialNum",
"Year", "Name"), row.names = c(NA, -5L), class = "data.frame")