【问题标题】:counting after and before change in value, within groups, generating new variables for each unique shift(在组内对值发生变化之前和之后的行计数,并为每一次变化生成新变量)
【发布时间】:2018-03-14 11:43:27
【问题描述】:

我想在各个分组(由 id 定义)内统计值的出现次数,关注的变量是 TF。当 TF 的值发生变化时,我想从该变化点开始分别向前和向后计数。计数结果应存储在新变量 PM# 中,即每个 PM# 对应 TF 的一次变化:变化之前的行为负数计数,变化之后的行为正数计数。根据我了解到的信息,我需要使用 rle,但我有点卡住了。

我制作了这个工作示例来说明我的问题。

我有这个数据

# Example input: 30 rows in three id groups (0, 1, 7) with a 0/1 indicator TF
# that contains missing values.
df <- data.frame(
  id = rep(c(0L, 1L, 7L), times = c(11L, 8L, 11L)),
  TF = c(
    NA, 0L, NA, 0L, 0L, 1L, 1L, 1L, NA, 0L, 0L,    # id == 0
    NA, 0L, 0L, 0L, 1L, 1L, 1L, NA,                # id == 1
    NA, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L     # id == 7
  )
)

这是我看到的那种数据

# show rows 1-12 and 19-30 of the example input
df[c(1:12,19:30),]
#>    id TF
#> 1   0 NA
#> 2   0  0
#> 3   0 NA
#> 4   0  0
#> 5   0  0
#> 6   0  1
#> 7   0  1
#> 8   0  1
#> 9   0 NA
#> 10  0  0
#> 11  0  0
#> 12  1 NA
#> 19  1 NA
#> 20  7 NA
#> 21  7  0
#> 22  7  0
#> 23  7  1
#> 24  7  0
#> 25  7  0
#> 26  7  1
#> 27  7  0
#> 28  7  1
#> 29  7  1
#> 30  7  1

我已经开始尝试 ave、cumsum 和 rle,但还没有通过这种方式解决。

# Asker's unfinished attempt (stated in the question not to solve the problem).
# Flag every row with a non-missing TF as 1; rows with missing TF stay NA.
df$PM01 <- with(df, ifelse(is.na(TF), NA, 1))
# Turn the flags into a running count within each combination of TF and id.
df$PM01 <- with(df, ave(PM01, TF, id, FUN=cumsum))

# Rebuild one group label per row from the run-length encoding of id
# (values repeated by their run lengths) and apply `count` to TF per label.
# NOTE(review): `count` is not a base R function -- presumably it comes from
# an add-on package that is not loaded here; confirm.
with(df, tapply(TF, rep(rle(id)[[2]], rle(id)[[1]]), count))

这是我想要得到的,

# Expected result: the id and TF columns of the example input plus one column
# per change of TF within an id group (PM1 ... PM5). Rows in the streak before
# a change carry negative counts ending at -1, rows in the streak after it
# carry positive counts starting at 1, and all other rows are NA.
dfa <- structure(list(id = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L), TF = c(NA, 0L, NA, 0L, 0L, 1L, 1L, 1L, NA, 0L, 
0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, NA, NA, 0L, 0L, 1L, 0L, 0L, 1L, 
0L, 1L, 1L, 1L), PM1 = c(NA, -3L, NA, -2L, -1L, 1L, 2L, 3L, NA, 
NA, NA, NA, -3L, -2L, -1L, 1L, 2L, 3L, NA, NA, -2L, -1L, 1L, 
NA, NA, NA, NA, NA, NA, NA), PM2 = c(NA, NA, NA, NA, NA, -3L, 
-2L, -1L, NA, 1L, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, -1L, 1L, 2L, NA, NA, NA, NA, NA), PM3 = c(NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, -2L, -1L, 1L, NA, NA, NA, NA), PM4 = c(NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA), PM5 = c(NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, 2L, 3L)), .Names = c("id", 
"TF", "PM1", "PM2", "PM3", "PM4", "PM5"), class = "data.frame", row.names = c(NA, 
-30L))

# show rows 1-12 and 19-30 of the expected result
dfa[c(1:12,19:30),]
#>    id TF PM1 PM2 PM3 PM4 PM5
#> 1   0 NA  NA  NA  NA  NA  NA
#> 2   0  0  -3  NA  NA  NA  NA
#> 3   0 NA  NA  NA  NA  NA  NA
#> 4   0  0  -2  NA  NA  NA  NA
#> 5   0  0  -1  NA  NA  NA  NA
#> 6   0  1   1  -3  NA  NA  NA
#> 7   0  1   2  -2  NA  NA  NA
#> 8   0  1   3  -1  NA  NA  NA
#> 9   0 NA  NA  NA  NA  NA  NA
#> 10  0  0  NA   1  NA  NA  NA
#> 11  0  0  NA   2  NA  NA  NA
#> 12  1 NA  NA  NA  NA  NA  NA
#> 19  1 NA  NA  NA  NA  NA  NA
#> 20  7 NA  NA  NA  NA  NA  NA
#> 21  7  0  -2  NA  NA  NA  NA
#> 22  7  0  -1  NA  NA  NA  NA
#> 23  7  1   1  -1  NA  NA  NA
#> 24  7  0  NA   1  -2  NA  NA
#> 25  7  0  NA   2  -1  NA  NA
#> 26  7  1  NA  NA   1  -1  NA
#> 27  7  0  NA  NA  NA   1  -1
#> 28  7  1  NA  NA  NA  NA   1
#> 29  7  1  NA  NA  NA  NA   2
#> 30  7  1  NA  NA  NA  NA   3

【问题讨论】:

    标签: r count data-manipulation


    【解决方案1】:

    这确实是一个棘手的问题,我相信代码可以进一步改进。但是,我能够重现您的预期结果。请用您的生产数据尝试这种方法。如果OK,我稍后再补充说明。

    library(data.table)
    
    # Convert df to a data.table by reference and record each row's position;
    # rn is needed at the end to join the NA rows back in.
    setDT(df)
    df[, rn := .I]

    # Keep the rows with a non-missing TF, number the streaks of TF within each
    # id, and count every streak upwards (1, 2, ...) and downwards (..., -2, -1).
    tmp <- df[!is.na(TF)]
    tmp[, rl := rleid(TF), by = id]
    tmp[, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)]

    # One row per id and change number (V1): a group with k streaks has k - 1 changes.
    changes <- tmp[, seq_len(max(rl) - 1L), by = .(id)]

    # Pair every row with every change of its id group, then fill PM with the
    # descending count for the streak before the change and with the ascending
    # count for the streak after it.
    long <- tmp[changes, on = .(id), allow.cartesian = TRUE]
    long[rl == V1, PM := dn]
    long[rl == V1 + 1L, PM := up]

    # Spread the changes into columns PM1, PM2, ..., then join with df to
    # restore the NA rows and drop the helper column rn.
    wide <- dcast(long, id + TF + rn ~ paste0("PM", V1), value.var = "PM")
    res <- wide[df, on = .(rn, id, TF)][, -"rn"]
    res
    
        id TF PM1 PM2 PM3 PM4 PM5
     1:  0 NA  NA  NA  NA  NA  NA
     2:  0  0  -3  NA  NA  NA  NA
     3:  0 NA  NA  NA  NA  NA  NA
     4:  0  0  -2  NA  NA  NA  NA
     5:  0  0  -1  NA  NA  NA  NA
     6:  0  1   1  -3  NA  NA  NA
     7:  0  1   2  -2  NA  NA  NA
     8:  0  1   3  -1  NA  NA  NA
     9:  0 NA  NA  NA  NA  NA  NA
    10:  0  0  NA   1  NA  NA  NA
    11:  0  0  NA   2  NA  NA  NA
    12:  1 NA  NA  NA  NA  NA  NA
    13:  1  0  -3  NA  NA  NA  NA
    14:  1  0  -2  NA  NA  NA  NA
    15:  1  0  -1  NA  NA  NA  NA
    16:  1  1   1  NA  NA  NA  NA
    17:  1  1   2  NA  NA  NA  NA
    18:  1  1   3  NA  NA  NA  NA
    19:  1 NA  NA  NA  NA  NA  NA
    20:  7 NA  NA  NA  NA  NA  NA
    21:  7  0  -2  NA  NA  NA  NA
    22:  7  0  -1  NA  NA  NA  NA
    23:  7  1   1  -1  NA  NA  NA
    24:  7  0  NA   1  -2  NA  NA
    25:  7  0  NA   2  -1  NA  NA
    26:  7  1  NA  NA   1  -1  NA
    27:  7  0  NA  NA  NA   1  -1
    28:  7  1  NA  NA  NA  NA   1
    29:  7  1  NA  NA  NA  NA   2
    30:  7  1  NA  NA  NA  NA   3
        id TF PM1 PM2 PM3 PM4 PM5
    
    # verify that the computed result matches the expected output dfa
    # NOTE(review): res is a data.table, whereas dfa is built in the question
    # with class "data.frame"; identical() also compares attributes, so the TRUE
    # shown below presumably relies on dfa having been converted first
    # (e.g. with setDT(dfa)) -- confirm
    identical(res, dfa)
    
    [1] TRUE
    

    如果每个组的更改超过 9 次,则应在调用 dcast() 时将 paste0("PM", V1) 替换为 sprintf("PM%02d", V1),以确保正确排序 PM 列。

    说明

    # Step 1: build the helper table tmp with streak numbers and per-streak counts.
    tmp <- 
      # coerce df to a data.table (by reference)
      setDT(df)[
        # add a row id column (required for the final join that brings the NA rows back in)
        , rn := .I][
          # drop the rows where TF is NA
          !is.na(TF)][
            # number the streaks of identical TF values within each id group
            , rl := rleid(TF), by = id][
              # create ascending (up) and descending (dn) counts for each streak;
              # this is done once here to avoid repeatedly creating the counts for each PM
              # (slight performance gain)
              , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)]
    
    
    # print the helper table
    tmp[]
    
        id TF rn rl up dn
     1:  0  0  2  1  1 -3
     2:  0  0  4  1  2 -2
     3:  0  0  5  1  3 -1
     4:  0  1  6  2  1 -3
     5:  0  1  7  2  2 -2
     6:  0  1  8  2  3 -1
     7:  0  0 10  3  1 -2
     8:  0  0 11  3  2 -1
     9:  1  0 13  1  1 -3
    10:  1  0 14  1  2 -2
    11:  1  0 15  1  3 -1
    12:  1  1 16  2  1 -3
    13:  1  1 17  2  2 -2
    14:  1  1 18  2  3 -1
    15:  7  0 21  1  1 -2
    16:  7  0 22  1  2 -1
    17:  7  1 23  2  1 -1
    18:  7  0 24  3  1 -2
    19:  7  0 25  3  2 -1
    20:  7  1 26  4  1 -1
    21:  7  0 27  5  1 -1
    22:  7  1 28  6  1 -3
    23:  7  1 29  6  2 -2
    24:  7  1 30  6  3 -1
        id TF rn rl up dn
    

    对于下一步,我们需要每个组内的更改计数 V1

    # for each id, enumerate its changes as 1 .. (number of streaks - 1);
    # the unnamed result column is shown as V1 in the output below
    tmp[, seq_len(max(rl) - 1L), by = .(id)]
    
       id V1
    1:  0  1
    2:  0  2
    3:  1  1
    4:  7  1
    5:  7  2
    6:  7  3
    7:  7  4
    8:  7  5
    

    现在,我们使用每个组的行创建所有可能更改的“笛卡尔连接”:

    # right join: pair every row of tmp with every change number (V1) of its id group;
    # allow.cartesian = TRUE permits the resulting multiplication of rows
    tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
      # rows of the streak just before change V1 get the descending (negative) count
      rl == V1, PM := dn][
        # rows of the streak just after change V1 get the ascending (positive) count
        rl == V1 + 1L, PM := up][]
    
        id TF rn rl up dn V1 PM
     1:  0  0  2  1  1 -3  1 -3
     2:  0  0  4  1  2 -2  1 -2
     3:  0  0  5  1  3 -1  1 -1
     4:  0  1  6  2  1 -3  1  1
     5:  0  1  7  2  2 -2  1  2
     6:  0  1  8  2  3 -1  1  3
     7:  0  0 10  3  1 -2  1 NA
     8:  0  0 11  3  2 -1  1 NA
     9:  0  0  2  1  1 -3  2 NA
    10:  0  0  4  1  2 -2  2 NA
    11:  0  0  5  1  3 -1  2 NA
    12:  0  1  6  2  1 -3  2 -3
    13:  0  1  7  2  2 -2  2 -2
    14:  0  1  8  2  3 -1  2 -1
    15:  0  0 10  3  1 -2  2  1
    16:  0  0 11  3  2 -1  2  2
    17:  1  0 13  1  1 -3  1 -3
    18:  1  0 14  1  2 -2  1 -2
    19:  1  0 15  1  3 -1  1 -1
    20:  1  1 16  2  1 -3  1  1
    21:  1  1 17  2  2 -2  1  2
    22:  1  1 18  2  3 -1  1  3
    23:  7  0 21  1  1 -2  1 -2
    24:  7  0 22  1  2 -1  1 -1
    25:  7  1 23  2  1 -1  1  1
    26:  7  0 24  3  1 -2  1 NA
    27:  7  0 25  3  2 -1  1 NA
    28:  7  1 26  4  1 -1  1 NA
    29:  7  0 27  5  1 -1  1 NA
    30:  7  1 28  6  1 -3  1 NA
    31:  7  1 29  6  2 -2  1 NA
    32:  7  1 30  6  3 -1  1 NA
    33:  7  0 21  1  1 -2  2 NA
    34:  7  0 22  1  2 -1  2 NA
    35:  7  1 23  2  1 -1  2 -1
    36:  7  0 24  3  1 -2  2  1
    37:  7  0 25  3  2 -1  2  2
    38:  7  1 26  4  1 -1  2 NA
    39:  7  0 27  5  1 -1  2 NA
    40:  7  1 28  6  1 -3  2 NA
    41:  7  1 29  6  2 -2  2 NA
    42:  7  1 30  6  3 -1  2 NA
    43:  7  0 21  1  1 -2  3 NA
    44:  7  0 22  1  2 -1  3 NA
    45:  7  1 23  2  1 -1  3 NA
    46:  7  0 24  3  1 -2  3 -2
    47:  7  0 25  3  2 -1  3 -1
    48:  7  1 26  4  1 -1  3  1
    49:  7  0 27  5  1 -1  3 NA
    50:  7  1 28  6  1 -3  3 NA
    51:  7  1 29  6  2 -2  3 NA
    52:  7  1 30  6  3 -1  3 NA
    53:  7  0 21  1  1 -2  4 NA
    54:  7  0 22  1  2 -1  4 NA
    55:  7  1 23  2  1 -1  4 NA
    56:  7  0 24  3  1 -2  4 NA
    57:  7  0 25  3  2 -1  4 NA
    58:  7  1 26  4  1 -1  4 -1
    59:  7  0 27  5  1 -1  4  1
    60:  7  1 28  6  1 -3  4 NA
    61:  7  1 29  6  2 -2  4 NA
    62:  7  1 30  6  3 -1  4 NA
    63:  7  0 21  1  1 -2  5 NA
    64:  7  0 22  1  2 -1  5 NA
    65:  7  1 23  2  1 -1  5 NA
    66:  7  0 24  3  1 -2  5 NA
    67:  7  0 25  3  2 -1  5 NA
    68:  7  1 26  4  1 -1  5 NA
    69:  7  0 27  5  1 -1  5 -1
    70:  7  1 28  6  1 -3  5  1
    71:  7  1 29  6  2 -2  5  2
    72:  7  1 30  6  3 -1  5  3
        id TF rn rl up dn V1 PM
    

    最后,中间结果从长格式改成宽格式。

    res <- 
      # create a "cartesian join" of all possible changes with the rows of each group
      tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
        # copy descending counts to rows before the switch
        rl == V1, PM := dn][
          # copy ascending counts to rows after the switch
          rl == V1 + 1L, PM := up][
            # reshape from long to wide with one new column per change count;
            # the zero-padded names (PM01, PM02, ...) keep the columns in order beyond 9 changes
            , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
              # join with original df to get NA rows back in
              df, on = .(rn, id, TF)][
                # omit helper column
                , -"rn"]
    

    【讨论】:

    • 真的太棒了!我已经在我的 生产数据 上运行它,它就像一个魅力。唯一的问题是当 # 达到两位数时 PM# 的排序。由于我不了解您的所有代码,因此我通过添加 sprintf("%02d", ... ) 进行了临时修复,这样我就有了 paste0("PM", sprintf("%02d", V1))。该解决方案也适用于我的生产数据(最多生成 PM12)。这显然是我的错误。非常感谢您的回答!
    • 您可以将其更改/调整为 tidyverse for this Q(看起来与此非常相似)
    • @Uwe,我正在尝试对您的代码进行轻微修改。在花了一个多小时后,我还是来寻求您的帮助。我不想再折叠/忽略 NA 了。这样,id == 0 中的情况 #3(当前被视为 -3)在 PM1 中就只是 NA。这也会影响 id == 0 内的 PM2:由于变化被 NA 隔开,因此根本不存在变化。您能指点一下我该如何进行这种修改吗?如果您愿意,我很乐意发布一个新问题。提前感谢您的好意。
    • @EricFail,恐怕我手头没有立即的解决方案。我需要深入研究我自己的代码 + 我不确定我是否完全理解你所追求的。也许,我需要看到预期的结果,以确保我走在正确的轨道上。因此,发布一个新问题确实是个好主意。
    • @Uwe,非常感谢!根据您的建议,我现在发布了an updated version of the question。我会继续努力,看看我是否也能弄清楚。再次感谢!
    【解决方案2】:

    我认为笛卡尔连接是不必要的:

    library(data.table)
    # Keep the rows with a non-missing TF, remembering each row's original
    # position in rn so that the NA rows can be joined back in at the end.
    tmp <- setDT(df)[, rn := .I][!is.na(TF)]
    # Number the streaks of TF within each id and count every streak upwards
    # (up: 1, 2, ...) and downwards (down: ..., -2, -1).
    tmp[, rl := rleid(TF), by = id]
    tmp[, c("up", "down") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)]
    # last equals -down on the final streak of each id (0 elsewhere); adding it
    # later cancels the descending count of a streak that has no change after it.
    tmp[, last := (rl == max(rl)) * (-down), by = id]

    # One wide table per helper column: one row per rn, one column per streak number.
    up_wide <- dcast(tmp, rn ~ rl, value.var = "up", fill = 0)
    down_wide <- dcast(tmp, rn ~ rl, value.var = "down", fill = 0)
    last_wide <- dcast(tmp, rn ~ rl, value.var = "last", fill = 0)

    # Change k combines the descending count of streak k with the ascending
    # count of streak k + 1 (column 1 of each wide table is rn).
    n_runs <- tmp[, max(rl)]
    res <- down_wide[, 2:n_runs] + up_wide[, 3:(n_runs + 1)] + last_wide[, 2:n_runs]

    # Zeros mark rows that are not adjacent to the change in question.
    res[res == 0] <- NA
    res[, rn := up_wide$rn]

    # Join with df to restore the NA rows, drop rn and put id and TF first.
    out <- res[df, on = "rn"][, -"rn"]
    setcolorder(out, c("id", "TF", 1:(n_runs - 1)))[]
    #    id TF   1   2   3   4   5
    # 1:  0 NA  NA  NA  NA  NA  NA
    # 2:  0  0  -3  NA  NA  NA  NA
    # 3:  0 NA  NA  NA  NA  NA  NA
    # 4:  0  0  -2  NA  NA  NA  NA
    # 5:  0  0  -1  NA  NA  NA  NA
    # 6:  0  1   1  -3  NA  NA  NA
    # 7:  0  1   2  -2  NA  NA  NA
    # 8:  0  1   3  -1  NA  NA  NA
    # 9:  0 NA  NA  NA  NA  NA  NA
    #10:  0  0  NA   1  NA  NA  NA
    #11:  0  0  NA   2  NA  NA  NA
    #12:  1 NA  NA  NA  NA  NA  NA
    #13:  1  0  -3  NA  NA  NA  NA
    #14:  1  0  -2  NA  NA  NA  NA
    #15:  1  0  -1  NA  NA  NA  NA
    #16:  1  1   1  NA  NA  NA  NA
    #17:  1  1   2  NA  NA  NA  NA
    #18:  1  1   3  NA  NA  NA  NA
    #19:  1 NA  NA  NA  NA  NA  NA
    #20:  7 NA  NA  NA  NA  NA  NA
    #21:  7  0  -2  NA  NA  NA  NA
    #22:  7  0  -1  NA  NA  NA  NA
    #23:  7  1   1  -1  NA  NA  NA
    #24:  7  0  NA   1  -2  NA  NA
    #25:  7  0  NA   2  -1  NA  NA
    #26:  7  1  NA  NA   1  -1  NA
    #27:  7  0  NA  NA  NA   1  -1
    #28:  7  1  NA  NA  NA  NA   1
    #29:  7  1  NA  NA  NA  NA   2
    #30:  7  1  NA  NA  NA  NA   3
    #    id TF   1   2   3   4   5
    

    【讨论】:

    • 我正在尝试对您的代码稍作修改。在花了一个多小时后,我还是来寻求您的帮助。我不想再折叠/忽略 NA 了。这样,id == 0 中的情况 #3(当前被视为 -3)在 PM1 中就只是 NA。这也会影响 id == 0 内的 PM2:由于变化被 NA 隔开,因此根本不存在变化。您能指点一下我该如何进行这种修改吗?如果您愿意,我很乐意发布一个新问题。预先感谢您的好意。
    • 我现在发布了an updated version of the question。谢谢。
    猜你喜欢
    • 2018-06-26
    • 2018-03-29
    • 1970-01-01
    • 1970-01-01
    • 2020-07-22
    • 2022-11-16
    • 1970-01-01
    • 2018-11-27
    • 2020-02-14
    相关资源
    最近更新 更多