使用 tidyverse/dplyr 使用观测值的直方图重新分配未观测值答案

【问题标题】：Using tidyverse/dplyr to reassign unobserved values using histogram of observed values使用 tidyverse/dplyr 使用观测值的直方图重新分配未观测值
【发布时间】：2017-12-01 10:03:02
【问题描述】：

在野外电钓作业期间对鳗鱼进行了采样。大多数被测量（批次=S），有些不是（批次=L）。我想使用最接近的 10 毫米值和在单个“S”测量中观察到的尺寸结构。

eel <- structure(list(op = c(529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L), size = c(101L, 103L, 110L, 112L, 115L, 119L, 120L, 
            121L, 121L, 121L, 123L, 127L, 128L, 129L, 135L, 140L, 146L, 147L, 
            147L, 148L, 150L, 152L, 152L, 155L, 159L, 160L, 164L, 164L, 164L, 
            175L, 180L, 184L, 190L, 192L, 193L, 213L, 216L, 227L, 233L, 235L, 
            240L, 253L, 256L, 278L, 287L, 289L, 303L, 307L, 312L, 323L, 80L, 
            82L, 92L, 93L, 100L, 112L, 114L, 120L, 121L, 122L, 128L, 131L, 
            147L, 149L, 151L, 156L, 159L, 161L, 164L, 165L, 167L, 168L, 172L, 
            195L, 222L, 228L, 242L, 257L, 265L, 265L, 275L, 290L, 294L, 294L, 
            307L, 310L, 315L, 330L, 374L, 80L, 143L, 176L, 165L, 141L, 139L, 
            93L, 138L, 129L, 143L, 139L, 126L, 84L, 126L, 119L, 129L, 111L, 
            112L, 426L, 188L, 186L, 293L, 235L, 188L, 173L, 177L, 176L, 165L, 
            165L, 166L, 141L, 231L, 168L, 167L, 186L, 168L, 161L, 187L, 129L, 
            155L, 84L), batch = c("S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "L", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "L"), number = c(0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 133L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 67L)), .Names = c("op", 
        "size", "batch", "number"), row.names = 4:133, class = "data.frame")

我尝试使用直方图对 tidyverse 进行此操作，我使用从我的尺寸结构中提取数据以下函数（这将确保所有观察到的值都在中断范围内）。我希望每 10 毫米放置一次新尺寸。

fn<-function(x) hist(x,
      breaks=seq(min(plyr::round_any(x, 10,f=floor)),plyr::round_any(max(x),10,f=ceiling),by=10),
      plot=FALSE)

然后我应用以下代码

hist <- eel%>% 
    filter(batch=='S') %>%
    select (size,op) %>% 
    group_by(op)  %>%
    by_slice(~fn(.x$size))

在这里，我在 .out 列中有一个直方图，带有 breaks 和 counts，并使用我想要的那些在我的数据框中创建新行。任何帮助将不胜感激。

【问题讨论】：

标签： r dplyr tidyverse

【解决方案1】：

我找到了一种方法，可能不是最好的方法，我使用了browser 参数来确定细节。困难的部分之一是重新分配的数字必须是整数，当根据每个班级规模的百分比对数字进行四舍五入时，会添加或丢失一些计数。所以我不得不将一些鳗鱼随机重新分配到尺寸结构中。做机器公差，参数sample(1:nrow(df),rr)不起作用，我不得不舍入rr。请注意，我尝试使用函数 map 和 map2 并没有管理它，所以任何其他更简单的方法都会非常受欢迎。

group_sample <-    
    eel%>%     
    filter(batch=='L')%>%
    select (op,number)


individual_sample <- 
    eel%>% 
    filter(batch=='S') %>%
    select (size,op)%>%
    group_by(op)  %>%   
    by_slice(~fn(.x$size)) %>%
    rename(hist=.out)

reassigned_sample<- inner_join(individual_sample,group_sample,by=c("op"))  %>%
    by_row(..f=function(this_row){
          #browser()
          # frequencies
          vec <-this_row["hist"][[1]][[1]]$counts/sum(this_row["hist"][[1]][[1]]$counts)*pull(this_row["number"])
          # numbers are rounded, but there is a problem with sum
          roundvec <- round(vec)
          sumvec <- sum(vec)
          sumroundvec <- sum(roundvec)
          # difference between rounded numbers and numbers
          rr <- sumroundvec-sumvec
          # creation du jeu de données ressemblant au tableau de départ (moins id première colonne)
          df <- data.frame("op"=pull(this_row["op"]),
              "size"=this_row["hist"][[1]][[1]]$mids-5,
              "batch"="SL",
              "number"=roundvec
             )  
          # remove lines with 0 number
          df<-df[df$number>0,]
          if (rr >0) {
            # randomly removing eels from some samples
            # round(rr) necessary otherwise might not be exact integer
            sss <- sample(1:nrow(df),round(rr))
            df[sss,"number"]<-df[sss,"number"]-1
            # randomly adding eels for some samples
          } else if (rr <0){            
            sss<-sample(1:nrow(df),round(-rr))
            df[sss,"number"]<-df[sss,"number"]+1
          } else {
            # do nothing
          }
          stopifnot(round(sum(df$number))==round(sumvec))
          return(df)          
        }) %>%
    rename(table=.out)

bind_rows(reassigned_sample$table)


op size batch number
1  529   80    SL      3
2  529   90    SL      4
3  529  100    SL      4

【讨论】：