非常感谢 akrun 提供的解决方案。它比我以前见过的任何东西都高效。
对性能的快速跟进。定义包含 200 列和 100,000 个观测值的数据集。
N = 1e5
mydf <- data.table(A = 1:N)
for(j in 1:100)
mydf[[paste0("V",j)]] = sample(c("x", "y", "z"),N, replace = T)
for(j in 101:200)
mydf[[paste0("V",j)]] = sample(c("v", "u", "w"),N, replace = T)
那么data.table方式的表现:
start_time <- Sys.time()
nm1 <- names(mydf)[-1]
out <- mydf[, lapply(.SD, function(x) as.data.frame.matrix(table(A, x))), .SDcols = nm1]
end_time <- Sys.time()
end_time - start_time
Time difference of 16.39327 secs
out[1:10,1:10]
V1.x V1.y V1.z V2.x V2.y V2.z V3.x V3.y V3.z V4.x
1: 1 0 0 1 0 0 0 0 1 1
2: 0 0 1 1 0 0 1 0 0 0
3: 0 1 0 0 1 0 1 0 0 0
4: 0 0 1 0 1 0 1 0 0 0
5: 0 0 1 0 1 0 0 0 1 1
6: 0 1 0 0 0 1 0 0 1 0
7: 1 0 0 1 0 0 0 1 0 1
8: 0 1 0 1 0 0 0 1 0 1
9: 0 0 1 1 0 0 1 0 0 0
10: 1 0 0 0 1 0 0 1 0 0
do.call 的性能基本相同,但我们还是需要重命名列:
start_time <- Sys.time()
out <- do.call(cbind, lapply(2:201, function(i) table(mydf$A, mydf[[i]]) ))
end_time <- Sys.time()
end_time - start_time
Time difference of 14.72284 secs
out[1:10,1:10]
x y z x y z x y z x
1 1 0 0 1 0 0 0 0 1 1
2 0 0 1 1 0 0 1 0 0 0
3 0 1 0 0 1 0 1 0 0 0
4 0 0 1 0 1 0 1 0 0 0
5 0 0 1 0 1 0 0 0 1 1
6 0 1 0 0 0 1 0 0 1 0
7 1 0 0 1 0 0 0 1 0 1
8 0 1 0 1 0 0 0 1 0 1
9 0 0 1 1 0 0 1 0 0 0
10 1 0 0 0 1 0 0 1 0 0
编辑
Ronak Shah 的替代 tidyverse 解决方案在数据上的效率更高:
start_time <- Sys.time()
mydf2 =
mydf %>%
pivot_longer(cols = starts_with('V')) %>%
unite(name, name, value) %>%
mutate(value = 1) %>%
pivot_wider(values_fill = 0)
end_time <- Sys.time()
end_time - start_time
Time difference of 9.892609 secs
# A tibble: 100,000 x 601
A V1_z V2_y V3_y V4_y V5_z V6_y V7_y V8_y V9_x V10_z V11_x V12_y
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 1 1 1 1 1 1 1 1 1
2 2 0 0 0 0 0 0 0 0 0 1 0 1
3 3 1 0 0 0 0 0 0 0 0 0 0 0
4 4 1 1 0 0 0 1 0 0 0 0 0 0
5 5 1 0 0 0 0 0 0 1 1 0 0 1
6 6 1 1 0 1 0 0 0 0 1 0 0 0
7 7 0 0 0 1 0 1 0 1 1 0 0 0
8 8 0 1 1 0 1 1 1 0 1 1 0 0
9 9 0 0 0 1 1 1 1 1 0 1 1 0
10 10 0 1 0 0 1 0 0 0 0 0 1 0
# … with 99,990 more rows, and 588 more variables: V13_x <dbl>, V14_x <dbl>,
# V15_z <dbl>, V16_z <dbl>, V17_y <dbl>, V18_y <dbl>, V19_z <dbl>,
# V20_x <dbl>, V21_y <dbl>, V22_z <dbl>, V23_z <dbl>, V24_x <dbl>,
# V25_y <dbl>, V26_z <dbl>, V27_z <dbl>, V28_x <dbl>, V29_z <dbl>,
# V30_y <dbl>, V31_x <dbl>, V32_y <dbl>, V33_x <dbl>, V34_y <dbl>,
# V35_y <dbl>, V36_y <dbl>, V37_x <dbl>, V38_y <dbl>, V39_y <dbl>,
# V40_x <dbl>, V41_y <dbl>, V42_z <dbl>, V43_z <dbl>, V44_y <dbl>,
# V45_x <dbl>, V46_x <dbl>, V47_x <dbl>, V48_y <dbl>, V49_y <dbl>,
# V50_z <dbl>, V51_z <dbl>, V52_z <dbl>, V53_z <dbl>, V54_y <dbl>,
# V55_y <dbl>, V56_y <dbl>, V57_x <dbl>, V58_x <dbl>, V59_x <dbl>,
# V60_z <dbl>, V61_y <dbl>, V62_x <dbl>, V63_y <dbl>, V64_y <dbl>,
# V65_y <dbl>, V66_x <dbl>, V67_x <dbl>, V68_x <dbl>, V69_y <dbl>,
# V70_x <dbl>, V71_y <dbl>, V72_y <dbl>, V73_x <dbl>, V74_y <dbl>,
# V75_x <dbl>, V76_z <dbl>, V77_x <dbl>, V78_y <dbl>, V79_z <dbl>,
# V80_z <dbl>, V81_x <dbl>, V82_z <dbl>, V83_x <dbl>, V84_x <dbl>,
# V85_z <dbl>, V86_z <dbl>, V87_x <dbl>, V88_z <dbl>, V89_x <dbl>,
# V90_y <dbl>, V91_y <dbl>, V92_z <dbl>, V93_z <dbl>, V94_z <dbl>,
# V95_z <dbl>, V96_z <dbl>, V97_z <dbl>, V98_y <dbl>, V99_z <dbl>,
# V100_y <dbl>, V101_v <dbl>, V102_w <dbl>, V103_u <dbl>, V104_w <dbl>,
# V105_v <dbl>, V106_v <dbl>, V107_u <dbl>, V108_w <dbl>, V109_v <dbl>,
# V110_v <dbl>, V111_u <dbl>, V112_v <dbl>, …