【问题标题】:Apply dcast multiple times for different variables对不同的变量多次应用 dcast
【发布时间】:2020-12-29 10:09:51
【问题描述】:

假设我有一个 ID 为 A 的数据框和几个因子变量。

library(data.table)
library(dplyr)

mydf <- data.table(
  A =  as.factor(c("A", "B", "C", "D","E")),
  V1 = as.factor(c("x", "x", "y", "x", "y")),
  V2 = as.factor(c("u", "v", "w", "v", "u"))
)

在一个真实的例子中,可以有更多的列(比如 1000)并且数据集有几百万行,所以我真的很关心性能。

我想把它转换成下面的data.frame:

  A V1_x V1_y V2_u V2_v V2_w
1 A    1    0    1    0    0
2 B    1    0    0    1    0
3 C    0    1    0    0    1
4 D    1    0    0    1    0
5 E    0    1    1    0    0

实现这种转换的有效方法是什么?

我现在的做法是分别为每一列应用 dcast 多次。并相应地重命名列。

f <- function(x) as.integer(length(x) > 0)
mydf2 =
  mydf %>%
  dcast(... ~ V1, fun.aggregate = f, value.var = "V1") %>%
  dcast(... ~ V2, fun.aggregate = f, value.var = "V2") 
  
colnames(mydf2) = c("A", "V1_x", "V1_y", "V2_u",  "V2_v",  "V2_w")

如果我们在循环中运行它,这对于许多变量来说似乎并不有效。此外,如果因子的数量因变量而异,则在每一步重命名变量都不是很可靠。

相同解决方案的循环版本:


mydf2 = mydf
for(tmp_var in  c("V1","V2")){
  ncol_before = ncol(mydf2)
  mydf2 =
    mydf2 %>%
    dcast( as.formula( sprintf("... ~ %s", tmp_var)  ),
          fun.aggregate = f, value.var = tmp_var)
  ncol_after = ncol(mydf2)
  # assign correct names to created vars
  colnames(mydf2)[ncol_before:ncol_after ] = paste0(tmp_var, "_", colnames(mydf2)[ncol_before:ncol_after ])
}
mydf2

【问题讨论】:

    标签: r data.table dcast


    【解决方案1】:

    这是cSplit_e的选项

    library(splitstackshape)
    cSplit_e(mydf, 'V1', type = 'character', fill = '0') %>%
          cSplit_e('V2', type = 'character', fill = '0')
    #   A V1 V2 V1_x V1_y V2_u V2_v V2_w
    #1: A  x  u    1    0    1    0    0
    #2: B  x  v    1    0    0    1    0
    #3: C  y  w    0    1    0    0    1
    #4: D  x  v    1    0    0    1    0
    #5: E  y  u    0    1    1    0    0
    

    或与table 来自base R

     do.call(cbind, lapply(2:3, function(i) table(mydf$A, mydf[[i]])))
    

    data.table语法中的相同方法

    nm1 <- names(mydf)[-1]
    out <- mydf[,  lapply(.SD, function(x) 
             as.data.frame.matrix(table(A, x))), .SDcols = nm1]
    mydf[, names(out) := out][]
    #   A V1 V2 V1.x V1.y V2.u V2.v V2.w
    #1: A  x  u    1    0    1    0    0
    #2: B  x  v    1    0    0    1    0
    #3: C  y  w    0    1    0    0    1
    #4: D  x  v    1    0    0    1    0
    #5: E  y  u    0    1    1    0    0
    

    【讨论】:

      【解决方案2】:

      非常感谢 akrun 提供的解决方案。它比我以前见过的任何东西都高效。

      对性能的快速跟进。定义包含 200 列和 100,000 个观测值的数据集。

      N = 1e5
      mydf <- data.table(A =  1:N)
      for(j in 1:100)
        mydf[[paste0("V",j)]] = sample(c("x", "y", "z"),N, replace = T)
      for(j in 101:200)
        mydf[[paste0("V",j)]] = sample(c("v", "u", "w"),N, replace = T)
      

      那么data.table方式的表现:

      start_time <- Sys.time()
      nm1 <- names(mydf)[-1]
      out <- mydf[,  lapply(.SD, function(x) as.data.frame.matrix(table(A, x))), .SDcols = nm1]
      end_time <- Sys.time()
      end_time - start_time
      

      Time difference of 16.39327 secs

      out[1:10,1:10]
          V1.x V1.y V1.z V2.x V2.y V2.z V3.x V3.y V3.z V4.x
       1:    1    0    0    1    0    0    0    0    1    1
       2:    0    0    1    1    0    0    1    0    0    0
       3:    0    1    0    0    1    0    1    0    0    0
       4:    0    0    1    0    1    0    1    0    0    0
       5:    0    0    1    0    1    0    0    0    1    1
       6:    0    1    0    0    0    1    0    0    1    0
       7:    1    0    0    1    0    0    0    1    0    1
       8:    0    1    0    1    0    0    0    1    0    1
       9:    0    0    1    1    0    0    1    0    0    0
      10:    1    0    0    0    1    0    0    1    0    0
      
      

      do.call 的性能基本相同,但我们还是需要重命名列:

      start_time <- Sys.time()
      out <- do.call(cbind, lapply(2:201, function(i) table(mydf$A, mydf[[i]]) ))
      end_time <- Sys.time()
      end_time - start_time
      

      Time difference of 14.72284 secs

      out[1:10,1:10]
      
         x y z x y z x y z x
      1  1 0 0 1 0 0 0 0 1 1
      2  0 0 1 1 0 0 1 0 0 0
      3  0 1 0 0 1 0 1 0 0 0
      4  0 0 1 0 1 0 1 0 0 0
      5  0 0 1 0 1 0 0 0 1 1
      6  0 1 0 0 0 1 0 0 1 0
      7  1 0 0 1 0 0 0 1 0 1
      8  0 1 0 1 0 0 0 1 0 1
      9  0 0 1 1 0 0 1 0 0 0
      10 1 0 0 0 1 0 0 1 0 0
      
      

      编辑

      Ronak Shah 的替代 tidyverse 解决方案在数据上的效率更高:

      start_time <- Sys.time()
      mydf2 =
        mydf %>%
        pivot_longer(cols = starts_with('V')) %>%
        unite(name, name, value) %>%
        mutate(value = 1) %>%
        pivot_wider(values_fill = 0)
      end_time <- Sys.time()
      end_time - start_time
      

      Time difference of 9.892609 secs

      # A tibble: 100,000 x 601
             A  V1_z  V2_y  V3_y  V4_y  V5_z  V6_y  V7_y  V8_y  V9_x V10_z V11_x V12_y
         <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
       1     1     1     1     1     1     1     1     1     1     1     1     1     1
       2     2     0     0     0     0     0     0     0     0     0     1     0     1
       3     3     1     0     0     0     0     0     0     0     0     0     0     0
       4     4     1     1     0     0     0     1     0     0     0     0     0     0
       5     5     1     0     0     0     0     0     0     1     1     0     0     1
       6     6     1     1     0     1     0     0     0     0     1     0     0     0
       7     7     0     0     0     1     0     1     0     1     1     0     0     0
       8     8     0     1     1     0     1     1     1     0     1     1     0     0
       9     9     0     0     0     1     1     1     1     1     0     1     1     0
      10    10     0     1     0     0     1     0     0     0     0     0     1     0
      # … with 99,990 more rows, and 588 more variables: V13_x <dbl>, V14_x <dbl>,
      #   V15_z <dbl>, V16_z <dbl>, V17_y <dbl>, V18_y <dbl>, V19_z <dbl>,
      #   V20_x <dbl>, V21_y <dbl>, V22_z <dbl>, V23_z <dbl>, V24_x <dbl>,
      #   V25_y <dbl>, V26_z <dbl>, V27_z <dbl>, V28_x <dbl>, V29_z <dbl>,
      #   V30_y <dbl>, V31_x <dbl>, V32_y <dbl>, V33_x <dbl>, V34_y <dbl>,
      #   V35_y <dbl>, V36_y <dbl>, V37_x <dbl>, V38_y <dbl>, V39_y <dbl>,
      #   V40_x <dbl>, V41_y <dbl>, V42_z <dbl>, V43_z <dbl>, V44_y <dbl>,
      #   V45_x <dbl>, V46_x <dbl>, V47_x <dbl>, V48_y <dbl>, V49_y <dbl>,
      #   V50_z <dbl>, V51_z <dbl>, V52_z <dbl>, V53_z <dbl>, V54_y <dbl>,
      #   V55_y <dbl>, V56_y <dbl>, V57_x <dbl>, V58_x <dbl>, V59_x <dbl>,
      #   V60_z <dbl>, V61_y <dbl>, V62_x <dbl>, V63_y <dbl>, V64_y <dbl>,
      #   V65_y <dbl>, V66_x <dbl>, V67_x <dbl>, V68_x <dbl>, V69_y <dbl>,
      #   V70_x <dbl>, V71_y <dbl>, V72_y <dbl>, V73_x <dbl>, V74_y <dbl>,
      #   V75_x <dbl>, V76_z <dbl>, V77_x <dbl>, V78_y <dbl>, V79_z <dbl>,
      #   V80_z <dbl>, V81_x <dbl>, V82_z <dbl>, V83_x <dbl>, V84_x <dbl>,
      #   V85_z <dbl>, V86_z <dbl>, V87_x <dbl>, V88_z <dbl>, V89_x <dbl>,
      #   V90_y <dbl>, V91_y <dbl>, V92_z <dbl>, V93_z <dbl>, V94_z <dbl>,
      #   V95_z <dbl>, V96_z <dbl>, V97_z <dbl>, V98_y <dbl>, V99_z <dbl>,
      #   V100_y <dbl>, V101_v <dbl>, V102_w <dbl>, V103_u <dbl>, V104_w <dbl>,
      #   V105_v <dbl>, V106_v <dbl>, V107_u <dbl>, V108_w <dbl>, V109_v <dbl>,
      #   V110_v <dbl>, V111_u <dbl>, V112_v <dbl>, …
      

      【讨论】:

        【解决方案3】:

        您可以获取长格式数据,组合列名和列值,创建一个虚拟列并获取宽格式数据。

        使用tidyverse 可以这样做:

        library(dplyr)
        library(tidyr)
        
        mydf %>%
          pivot_longer(cols = starts_with('V')) %>%
          unite(name, name, value) %>%
          mutate(value = 1) %>%
          pivot_wider(values_fill = 0)
        
        # A tibble: 5 x 6
        #   A      V1_x  V2_u  V2_v  V1_y  V2_w
        #  <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
        #1 A         1     1     0     0     0
        #2 B         1     0     1     0     0
        #3 C         0     0     0     1     1
        #4 D         1     0     1     0     0
        #5 E         0     1     0     1     0
        

        如果您想按照相同的逻辑在data.table 中执行此操作,您可以尝试:

        library(data.table)
        df1 <- melt(setDT(mydf), id.vars = 'A')
        df1[, c('variable', 'value') := .(paste(variable, value, sep = "_"), 1)]
        dcast(df1, A~variable, value.var = 'value', fill = 0)
        

        【讨论】:

        • 也是一个非常优雅的解决方案。非常感谢。实际上,大数据的性能甚至比使用 data.table 方法之前报告的更好Time difference of 9.892609 secs
        猜你喜欢
        • 2020-05-11
        • 1970-01-01
        • 2021-11-03
        • 2018-05-27
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2021-11-16
        相关资源
        最近更新 更多