【问题标题】:Efficient functions over specific data.frame columns in a list of data.frames对 data.frames 列表中特定 data.frame 列的高效函数
【发布时间】:2015-06-26 17:03:24
【问题描述】:

我有一个data.frames 的列表。例如

# Reproducible example data: a list of five data.frames that share the same
# ids and column layout but hold independently drawn random values.
set.seed(1)
ids <- c("a", "b", "c", "d", "e")
n_ids <- length(ids)

# Build the list in one pass instead of growing it inside a for loop.
# The columns are drawn in the same order as before (p, m, hp, hm, d, a),
# so the random stream, and therefore every value, is unchanged.
my_list <- lapply(seq_len(5), function(i) {
  data.frame(
    id = ids,
    p = rnorm(n_ids),
    m = rnorm(n_ids),
    hp = runif(n_ids),
    hm = runif(n_ids),
    d = rnorm(n_ids),
    a = rnorm(n_ids)
  )
})

我想要高效地计算:对于“id”列中的每个 id,列表中所有 data.frame 的“p”、“m”、“d”和“a”列的方差。理想情况下,这将返回一个像这样的 data.frame(基于上面随机生成的值):

> result.df
  id     var_p     var_m      var_d     var_a
1  a 0.2371569 1.7810729 0.08264279 0.5074250
2  b 0.1091675 0.2107997 1.15051229 1.1578691
3  c 0.5385789 0.7650123 0.44215343 0.3137903
4  d 1.0174542 0.7818498 0.06414317 0.6079849
5  e 0.7343667 1.2870542 1.41615858 0.7362462

【问题讨论】:

    标签: r list dataframe


    【解决方案1】:

    使用my_list

    library(plyr)
    # Stack every data.frame in my_list into one long data.frame.
    df <- do.call(rbind, my_list)
    # Column-wise variance restricted to the four columns of interest.
    var_of_cols <- colwise(var, c("p", "m", "d", "a"))
    # Apply it to each id group and collect the rows into one data.frame.
    out <- ddply(df, .(id), var_of_cols)
    
    #> out
    #  id         p         m          d         a
    #1  a 0.2371569 1.7810729 0.08264279 0.5074250
    #2  b 0.1091675 0.2107997 1.15051229 1.1578691
    #3  c 0.5385789 0.7650123 0.44215343 0.3137903
    #4  d 1.0174542 0.7818498 0.06414317 0.6079849
    #5  e 0.7343667 1.2870542 1.41615858 0.7362462
    

    或者使用 base R 的替代方案,结合 lapply 和 apply

    # Stack every data.frame in my_list into one long data.frame.
    df <- do.call(rbind, my_list)

    # One data.frame per id.
    by_id <- split(df, df$id)
    # Variance of each selected column within a single id group.
    col_vars <- function(x) apply(x[, c("p", "m", "d", "a")], 2, var)
    # One row of variances per id; the row names are the ids.
    df1 <- do.call(rbind, lapply(by_id, col_vars))

    # Expose the ids (currently only row names) as a regular column.
    out <- transform(df1, id = row.names(df1))
    
    #> out
    #          p         m          d         a id
    #a 0.2371569 1.7810729 0.08264279 0.5074250  a
    #b 0.1091675 0.2107997 1.15051229 1.1578691  b
    #c 0.5385789 0.7650123 0.44215343 0.3137903  c
    #d 1.0174542 0.7818498 0.06414317 0.6079849  d
    #e 0.7343667 1.2870542 1.41615858 0.7362462  e
    

    或使用doBy

    library(doBy)
    # Stack every data.frame in my_list into one long data.frame.
    df <- do.call(rbind, my_list)
    # Variance of p, m, d and a within each id; keep.names = TRUE keeps the
    # original column names in the result (see the printed output).
    out <- summaryBy(
      p + m + d + a ~ id,
      data = df,
      FUN = var,
      keep.names = TRUE
    )
    
    #> out
    #  id         p         m          d         a
    #1  a 0.2371569 1.7810729 0.08264279 0.5074250
    #2  b 0.1091675 0.2107997 1.15051229 1.1578691
    #3  c 0.5385789 0.7650123 0.44215343 0.3137903
    #4  d 1.0174542 0.7818498 0.06414317 0.6079849
    #5  e 0.7343667 1.2870542 1.41615858 0.7362462
    

    或使用sqldf

    library(sqldf)
    # Stack every data.frame in my_list into one long data.frame.
    df = do.call(rbind, my_list)
    # The query refers to the data.frame `df` by name and computes the
    # variance of p, m, d and a for each id. The result columns are named
    # after the SQL expressions themselves (see the printed output).
    out = sqldf("select id, variance(p), variance(m), 
                 variance(d), variance(a) from df group by id")
    
    #> out
    #  id variance(p) variance(m) variance(d) variance(a)
    #1  a   0.2371569   1.7810729  0.08264279   0.5074250
    #2  b   0.1091675   0.2107997  1.15051229   1.1578691
    #3  c   0.5385789   0.7650123  0.44215343   0.3137903
    #4  d   1.0174542   0.7818498  0.06414317   0.6079849
    #5  e   0.7343667   1.2870542  1.41615858   0.7362462
    

    【讨论】:

      【解决方案2】:

      这是一个 base R 的方法

      # Stack every data.frame in my_list into one long data.frame.
      dat <- do.call(rbind, my_list)
      # Variance of p, m, d and a within each id, via the formula interface.
      aggregate(cbind(p, m, d, a) ~ id, data = dat, FUN = var)
      

      给了

        id         p         m          d         a
      1  a 0.2371569 1.7810729 0.08264279 0.5074250
      2  b 0.1091675 0.2107997 1.15051229 1.1578691
      3  c 0.5385789 0.7650123 0.44215343 0.3137903
      4  d 1.0174542 0.7818498 0.06414317 0.6079849
      5  e 0.7343667 1.2870542 1.41615858 0.7362462
      

      【讨论】:

        【解决方案3】:
        library(data.table)
        # Stack the list into one data.table, then take the variance of the
        # selected columns (.SDcols) within each id, all in a single chain.
        rbindlist(my_list)[
          ,
          lapply(.SD, var),
          by = id,
          .SDcols = c("p", "m", "d", "a")
        ]
        #    id         p         m          d         a
        # 1:  a 0.2371569 1.7810729 0.08264279 0.5074250
        # 2:  b 0.1091675 0.2107997 1.15051229 1.1578691
        # 3:  c 0.5385789 0.7650123 0.44215343 0.3137903
        # 4:  d 1.0174542 0.7818498 0.06414317 0.6079849
        # 5:  e 0.7343667 1.2870542 1.41615858 0.7362462
        

        【讨论】:

        • 您可以把调用链接起来,以避免中间赋值 (+1)。
        【解决方案4】:

        已更新为使用 bind_rows()(根据 @hadley 的建议,它比 do.call(rbind, ...) 更高效)

        library(dplyr)
        # Stack the list of data.frames (my_list) row-wise and keep only the
        # id column plus the four columns whose variance is wanted. The input
        # must be my_list: `dat` does not exist yet at this point.
        dat <- bind_rows(my_list)[, c("id", "p", "m", "d", "a")]
        # Per-id variance of every remaining column. across() replaces the
        # deprecated summarise_each(funs(...)) and needs dplyr >= 1.0.0.
        dat %>%
          group_by(id) %>%
          summarise(across(everything(), var))
        
        #   id         p         m          d         a
        # 1  a 0.2371569 1.7810729 0.08264279 0.5074250
        # 2  b 0.1091675 0.2107997 1.15051229 1.1578691
        # 3  c 0.5385789 0.7650123 0.44215343 0.3137903
        # 4  d 1.0174542 0.7818498 0.06414317 0.6079849
        # 5  e 0.7343667 1.2870542 1.41615858 0.7362462
        

        【讨论】:

        • bind_rows() 会更高效
        猜你喜欢
        • 2020-06-09
        • 2023-03-28
        • 1970-01-01
        • 1970-01-01
        • 2021-08-10
        • 2019-12-18
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        相关资源
        最近更新 更多