【问题标题】:Apply a function to dataframe subsetted by all possible combinations of categorical variables将函数应用于由分类变量的所有可能组合子集的数据框
【发布时间】:2013-05-29 21:53:53
【问题描述】:

具有分类变量 catA、catB 和 catC 的示例数据框。 Obs 是一些观察值。

catA <- rep(factor(c("a","b","c")), length.out=100)
catB <- rep(factor(1:4), length.out=100)
catC <- rep(factor(c("d","e","f")), length.out=100)
obs <- runif(100,0,100)
dat <- data.frame(catA, catB, catC, obs)

分类变量的所有可能的数据子集。

allsubs <- expand.grid(catA = c(NA,levels(catA)), catB = c(NA,levels(catB)),
    catC = c(NA,levels(catC)))
> head(allsubs, n=10)
   catA catB catC
 1  <NA> <NA> <NA>
 2     a <NA> <NA>
 3     b <NA> <NA>
 4     c <NA> <NA>
 5  <NA>    1 <NA>
 6     a    1 <NA>
 7     b    1 <NA>
 8     c    1 <NA>
 9  <NA>    2 <NA>
 10    a    2 <NA>

现在,创建输出数据框的最简单方法是什么,其结果列包含来自应用于 dat 的相应子集(在每一行中由 cat 变量的组合定义)的函数的结果。因此输出应类似于以下数据框“whatiwant”,其中结果列将包含应用于每个子集的函数的结果。

> whatiwant
    catA catB catC results
 1  <NA> <NA> <NA>       *
 2     a <NA> <NA>       *
 3     b <NA> <NA>       *
 4     c <NA> <NA>       *
 5  <NA>    1 <NA>       *
 6     a    1 <NA>       *
 7     b    1 <NA>       *
 8     c    1 <NA>       *
 9  <NA>    2 <NA>       *
 10    a    2 <NA>       *

所以,如果应用的函数是“平均”,结果应该是:

dat$results[1] = mean(subset(dat,)$obs)
dat$results[2] = mean(subset(dat, catA=="a")$obs)

等等等等。

【问题讨论】:

    标签: r


    【解决方案1】:
    ans <- with(dat, tapply(obs, list(catA, catB, catC), mean))
    ans <- data.frame(expand.grid(dimnames(ans)), results=c(ans))
    names(ans)[1:3] <- names(dat)[1:3]
    
    str(ans)
    # 'data.frame':  36 obs. of  4 variables:
    #  $ catA   : Factor w/ 3 levels "a","b","c": 1 2 3 1 2 3 1 2 3 1 ...
    #  $ catB   : Factor w/ 4 levels "1","2","3","4": 1 1 1 2 2 2 3 3 3 4 ...
    #  $ catC   : Factor w/ 3 levels "d","e","f": 1 1 1 1 1 1 1 1 1 1 ...
    #  $ results: num  69.7 NA NA 55.3 NA ...
    

    【讨论】:

      【解决方案2】:

      另一种方法,一个函数获取变量的所有组合,另一个函数将函数应用于所有子集。组合功能是从另一个帖子偷来的...

      ## return all combinations of vector up to maximum length n
      multicombn <- function(dat, n) {
          unlist(lapply(1:n, function(x) combn(dat, x, simplify=F)), recursive=F)
      }
      

      对于 allsubs,vars 的格式为 c("catA","catB","catC"), out.name = "mean". func 需要以 ddply 的形式编写,

      func=function(x) mean(x$obs, na.rm=TRUE)
      
      library(plyr)
      allsubs <- function(indat, vars, func=NULL, out.name=NULL) {
          results <- data.frame()
          nvars <- rev(multicombn(vars,length(vars)))
          for(i in 1:length(nvars)) {
              results <-
                  rbind.fill(results, ddply(indat, unlist(nvars[i]), func))
          }
          if(!missing(out.name)) names(results)[length(vars)+1] <- out.name
          results
      }
      

      这个答案和 shwaund 的一个区别,这不会返回空行 子集,因此结果列中没有 NA。

      allsubs(dat, c("catA","catB","catc"), func, out.name="mean")
      > head(allsubs(dat, vars, func, out.name = "mean"),20)
         catA catB catC     mean
      1     a    1    d 56.65909
      2     a    2    d 54.98116
      3     a    3    d 37.52655
      4     a    4    d 58.29034
      5     b    1    e 52.88945
      6     b    2    e 50.43122
      7     b    3    e 52.57115
      8     b    4    e 59.45348
      9     c    1    f 52.41637
      10    c    2    f 34.58122
      11    c    3    f 46.80256
      12    c    4    f 51.58668
      13 <NA>    1    d 56.65909
      14 <NA>    1    e 52.88945
      15 <NA>    1    f 52.41637
      16 <NA>    2    d 54.98116
      17 <NA>    2    e 50.43122
      18 <NA>    2    f 34.58122
      19 <NA>    3    d 37.52655
      20 <NA>    3    e 52.57115
      

      【讨论】:

        【解决方案3】:

        这不是最干净的解决方案,但我认为它接近你想要的。

        getAllSubs <- function(df, lookup, fun) {
        
          out <- lapply(1:nrow(lookup), function(i) {
        
            df_new <- df
        
            if(length(na.omit(unlist(lookup[i,]))) > 0) {
        
              for(j in colnames(lookup)[which(!is.na(unlist(lookup[i,])))]) {
                df_new <- df_new[df_new[,j] == lookup[i,j],]
              }  
            } 
            fun(df_new)  
          })
        
          if(mean(sapply(out, length) ==1) == 1) {
            out <- unlist(out)
          } else {
            out <- do.call("rbind", out)
          }
        
          final <- cbind(lookup, out)
          final[is.na(final)] <- NA
          final
        }
        

        正如当前编写的那样,您必须事先构造查找表,但您可以轻松地将构造移至函数本身。我在最后添加了几行以确保它可以容纳不同长度的输出,因此将 NaN 转换为 NA,只是因为这似乎可以创建更清晰的输出。按照目前的编写方式,在所有列均为 NA 的情况下,它将函数应用于整个原始数据帧。

        dat_out <- getAllSubs(dat, allsubs, function(x) mean(x$obs, na.rm = TRUE))
        
        head(dat_out,20)
        
           catA catB catC      out
        1  <NA> <NA> <NA> 47.25446
        2     a <NA> <NA> 51.54226
        3     b <NA> <NA> 46.45352
        4     c <NA> <NA> 43.63767
        5  <NA>    1 <NA> 47.23872
        6     a    1 <NA> 66.59281
        7     b    1 <NA> 32.03513
        8     c    1 <NA> 40.66896
        9  <NA>    2 <NA> 45.16588
        10    a    2 <NA> 50.59323
        11    b    2 <NA> 51.02013
        12    c    2 <NA> 33.15251
        13 <NA>    3 <NA> 51.67809
        14    a    3 <NA> 48.13645
        15    b    3 <NA> 57.92084
        16    c    3 <NA> 49.27710
        17 <NA>    4 <NA> 44.93515
        18    a    4 <NA> 40.36266
        19    b    4 <NA> 44.26717
        20    c    4 <NA> 50.74718
        

        【讨论】:

          【解决方案4】:

          仅使用向量化函数和基 R

          # Find all possible subsets of your data
          combVars <- c("catA", "catB", "catC")
          subsets <- lapply(0:length(combVars), combn, x = combVars, simplify = FALSE)
          subsets <- do.call(c, subsets)
          # Calculate means by each subset
          meanValues <- lapply(subsets, function(x) aggregate(dat[["obs"]], by = dat[x], FUN = mean))
          # Pull them all into one dataframe
          Reduce(function(x,y) merge(x,y,all=TRUE), meanValues)
          

          【讨论】:

          • 很好的答案。我已经为 data.tables here 调整了它。
          猜你喜欢
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 2012-10-16
          • 2014-04-29
          • 1970-01-01
          • 2020-02-24
          相关资源
          最近更新 更多