【问题标题】:Using tidyverse/dplyr to reassign unobserved values using histogram of observed values使用 tidyverse/dplyr 使用观测值的直方图重新分配未观测值
【发布时间】:2017-12-01 10:03:02
【问题描述】:

在野外电钓作业期间对鳗鱼进行了采样。大多数被测量(批次=S), 有些不是(批次=L)。 我想使用最接近的 10 毫米值和 在单个“S”测量中观察到的尺寸结构。

eel <- structure(list(op = c(529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L), size = c(101L, 103L, 110L, 112L, 115L, 119L, 120L, 
            121L, 121L, 121L, 123L, 127L, 128L, 129L, 135L, 140L, 146L, 147L, 
            147L, 148L, 150L, 152L, 152L, 155L, 159L, 160L, 164L, 164L, 164L, 
            175L, 180L, 184L, 190L, 192L, 193L, 213L, 216L, 227L, 233L, 235L, 
            240L, 253L, 256L, 278L, 287L, 289L, 303L, 307L, 312L, 323L, 80L, 
            82L, 92L, 93L, 100L, 112L, 114L, 120L, 121L, 122L, 128L, 131L, 
            147L, 149L, 151L, 156L, 159L, 161L, 164L, 165L, 167L, 168L, 172L, 
            195L, 222L, 228L, 242L, 257L, 265L, 265L, 275L, 290L, 294L, 294L, 
            307L, 310L, 315L, 330L, 374L, 80L, 143L, 176L, 165L, 141L, 139L, 
            93L, 138L, 129L, 143L, 139L, 126L, 84L, 126L, 119L, 129L, 111L, 
            112L, 426L, 188L, 186L, 293L, 235L, 188L, 173L, 177L, 176L, 165L, 
            165L, 166L, 141L, 231L, 168L, 167L, 186L, 168L, 161L, 187L, 129L, 
            155L, 84L), batch = c("S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "L", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "L"), number = c(0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 133L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 67L)), .Names = c("op", 
        "size", "batch", "number"), row.names = 4:133, class = "data.frame")

我尝试使用直方图对 tidyverse 进行此操作,我使用从我的尺寸结构中提取数据 以下函数(这将确保所有观察到的值都在中断范围内)。我希望每 10 毫米放置一次新尺寸。

fn<-function(x) hist(x,
      breaks=seq(min(plyr::round_any(x, 10,f=floor)),plyr::round_any(max(x),10,f=ceiling),by=10),
      plot=FALSE)

然后我应用以下代码

hist <- eel%>% 
    filter(batch=='S') %>%
    select (size,op) %>% 
    group_by(op)  %>%
    by_slice(~fn(.x$size))  

在这里,我在 .out 列中有一个直方图,带有 breakscounts,并使用我想要的那些 在我的数据框中创建新行。任何帮助将不胜感激。

【问题讨论】:

    标签: r dplyr tidyverse


    【解决方案1】:

    我找到了一种方法,可能不是最好的方法,我使用了browser 参数来确定细节。困难的部分之一是重新分配的数字必须是整数,当根据每个班级规模的百分比对数字进行四舍五入时,会添加或丢失一些计数。所以我不得不将一些鳗鱼随机重新分配到尺寸结构中。做机器公差,参数sample(1:nrow(df),rr)不起作用,我不得不舍入rr。请注意,我尝试使用函数 mapmap2 并没有管理它,所以任何其他更简单的方法都会非常受欢迎。

    group_sample <-    
        eel%>%     
        filter(batch=='L')%>%
        select (op,number)
    
    
    individual_sample <- 
        eel%>% 
        filter(batch=='S') %>%
        select (size,op)%>%
        group_by(op)  %>%   
        by_slice(~fn(.x$size)) %>%
        rename(hist=.out)
    
    reassigned_sample<- inner_join(individual_sample,group_sample,by=c("op"))  %>%
        by_row(..f=function(this_row){
              #browser()
              # frequencies
              vec <-this_row["hist"][[1]][[1]]$counts/sum(this_row["hist"][[1]][[1]]$counts)*pull(this_row["number"])
              # numbers are rounded, but there is a problem with sum
              roundvec <- round(vec)
              sumvec <- sum(vec)
              sumroundvec <- sum(roundvec)
              # difference between rounded numbers and numbers
              rr <- sumroundvec-sumvec
              # creation du jeu de données ressemblant au tableau de départ (moins id première colonne)
              df <- data.frame("op"=pull(this_row["op"]),
                  "size"=this_row["hist"][[1]][[1]]$mids-5,
                  "batch"="SL",
                  "number"=roundvec
                 )  
              # remove lines with 0 number
              df<-df[df$number>0,]
              if (rr >0) {
                # randomly removing eels from some samples
                # round(rr) necessary otherwise might not be exact integer
                sss <- sample(1:nrow(df),round(rr))
                df[sss,"number"]<-df[sss,"number"]-1
                # randomly adding eels for some samples
              } else if (rr <0){            
                sss<-sample(1:nrow(df),round(-rr))
                df[sss,"number"]<-df[sss,"number"]+1
              } else {
                # do nothing
              }
              stopifnot(round(sum(df$number))==round(sumvec))
              return(df)          
            }) %>%
        rename(table=.out)
    
    bind_rows(reassigned_sample$table)
    
    
    op size batch number
    1  529   80    SL      3
    2  529   90    SL      4
    3  529  100    SL      4
    

    【讨论】:

      猜你喜欢
      • 2015-05-05
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2016-10-15
      • 1970-01-01
      • 2019-01-01
      • 2021-01-29
      相关资源
      最近更新 更多