【问题标题】:replace loop with an *pply alternative用 *pply 替代替换循环
【发布时间】:2014-03-09 21:34:04
【问题描述】:

我试图通过用 tapply (How to do vlookup and fill down (like in Excel) in R?) 替换一些查找循环来加速我的代码,我偶然发现了这段代码:

DF<-data.frame(id=c(rep("A", 5),rep("B", 7),rep("C", 9)), series=NA, chi=c(letters[1:5], LETTERS[6:12], letters[13:21]))
for (i in unique(DF$id)){
  DF$series[ DF$id==i ]<-1:length(DF$id[ DF$id==i ])
}
DF

是否可以用*apply 系列函数替换它?或者有什么其他方法可以加快速度?

【问题讨论】:

    标签: r for-loop


    【解决方案1】:

    你可以试试ave:

    DF$series <- ave(DF$id, DF$id, FUN = seq_along)
    

    但对于更大的数据集,dplyr 更快。

    library(dplyr)
    
    fun_ave <- function(df) transform(df, series = ave(id, id, FUN = seq_along))
    
    fun_dp <- function(df) df %.%
                     group_by(id) %.%
                     mutate(
                       series = seq_along(id))
    
    df <- data.frame(id= sample(letters[1:3], 100000, replace = TRUE))
    
    microbenchmark(fun_ave(df))
    # Unit: milliseconds
    #        expr      min       lq   median      uq      max neval
    # fun_ave(df) 38.59112 39.40802 50.77921 51.2844 128.6791   100
    
    
    microbenchmark(fun_dp(df))
    # Unit: milliseconds
    #       expr      min       lq   median       uq      max neval
    # fun_dp(df) 4.977035 5.034244 5.060663 5.265173 17.16018   100
    

    【讨论】:

      【解决方案2】:

      也可以使用data.table

      library(data.table)
      DT <- data.table(DF)
      DT[, series_new := 1:.N, by = id]
      

      并使用tapply

      DF$series_new  <- unlist(tapply(DF$id, DF$id, function(x) 1:length(x)))
      

      data.tabledplyr 之上扩展@Henrik 的比较对于大型数据集来说要快得多。

      library(data.table)
      library(dplyr)
      
      df <- data.frame(id= sample(letters[1:3], 100000, replace = TRUE), stringsAsFactors = F)
      dt <- data.table(df)
      
      fun_orig <- function(df){
        for (i in unique(df$id)){
          df$series[df$id==i]<-1:length(df$id[df$id==i])
        }}
      
      fun_tapply  <- function(df){
        df$series <- unlist(tapply(df$id, df$id, function(x) 1:length(x)))
      }
      
      fun_ave <- function(df){
        transform(df, series = ave(df$id, df$id, FUN = seq_along))
      }
      
      fun_dp <- function(df){
        df %.%
        group_by(id) %.%
        mutate(
          series = seq_along(id))
      }
      
      fun_dt <- function(dt) dt[, 1:.N, by = id] 
      
      microbenchmark(fun_dt(dt), times = 1000)
      #Unit: milliseconds
      #       expr      min       lq   median      uq      max neval
      # fun_dt(dt) 2.473253 2.597031 2.771771 3.76307 40.59909  1000
      
      microbenchmark(fun_dp(df), times = 1000)
      #Unit: milliseconds
      #       expr     min       lq   median       uq      max neval
      # fun_dp(df) 2.71375 2.786829 2.914569 3.081609 40.48445  1000
      
      microbenchmark(fun_orig(df), times = 1000)
      #Unit: milliseconds
      #         expr      min       lq   median       uq      max neval
      # fun_orig(df) 30.65534 31.93449 32.72991 33.88885 75.13967  1000
      
      microbenchmark(fun_tapply(df), times = 1000)
      #Unit: milliseconds
      #           expr      min       lq   median       uq      max neval
      # fun_tapply(df) 56.67636 61.72207 66.37193 102.4189 124.6661  1000
      
      microbenchmark(fun_ave(df), times = 1000)
      #Unit: milliseconds
      #        expr      min      lq   median       uq      max neval
      # fun_ave(df) 97.36992 103.161 107.5007 139.1362 157.9464  1000
      

      【讨论】:

        猜你喜欢
        • 2023-01-13
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2019-04-04
        • 2018-08-20
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        相关资源
        最近更新 更多