【问题标题】:How to convert data frame R code to data.table code(如何将数据框 R 代码转换为 data.table 代码)
【发布时间】:2019-10-05 09:42:34
【问题描述】:

我有一个包含赛马数据的数据集。我编写了一个函数:当某行的 avgl6rank 值缺失时,计算该行马匹在该行比赛日期之前最近 6 场比赛的平均排名。该函数运行速度非常慢,所以我想用 data.table 来加速。但我不知道如何在按 horsenum 分组的同时,只使用 rdate 早于当前行日期的记录。在下面这段代码中,日期过滤条件应该加在哪里?

# Attempt from the question: fills missing avgl6rank per horse in one step.
# Limitation: tail(rank, 6) is taken over all of the horse's rows that have a
# missing avgl6rank, so it is not restricted to races before each row's rdate.
jc.data[
  is.na(avgl6rank),
  avgl6rank := round(mean(tail(rank, 6), na.rm = TRUE)),
  by = horsenum
]

预期输出的结构与输入数据框相同,只需填补 avgl6rank 列中的缺失值:如果能找到该马在该行日期之前最近的 6 场(或更少)比赛记录,就用这些记录的平均排名(四舍五入)填入。

#' Average rank of a horse's most recent races before a given race.
#'
#' Meant to be called once per row of `jc.data`, e.g. through
#' `apply(jc.data, 1, Avg6RankF)`, in which case `df` arrives as a named
#' character vector. A one-row data frame or a list with the same names
#' also works.
#'
#' Reads the global `jc.data` (columns `horsenum`, `rdate`, `rank`).
#'
#' @param df One row with elements `avgl6rank`, `horsenum` and `rdate`.
#' @return A numeric scalar: the existing `avgl6rank` when it is not
#'   missing; otherwise the rounded mean rank of up to 6 most recent races
#'   of the same horse strictly before `rdate`; `NA_real_` when there is
#'   no such race or all of those ranks are missing.
Avg6RankF <- function(df) {
  # apply() passes every cell as character, so convert back to numeric;
  # otherwise the collected result becomes a character column.
  existing <- as.numeric(df[["avgl6rank"]])
  if (!is.na(existing)) {
    return(existing)
  }

  race_date <- as.Date(df[["rdate"]])
  # which() drops NA comparisons, matching what subset() did before.
  idx <- which(
    jc.data$horsenum == df[["horsenum"]] & jc.data$rdate < race_date
  )
  if (length(idx) == 0L) {
    return(NA_real_)
  }

  # Sort the earlier races by date so tail() really picks the most recent
  # ones even when jc.data is not stored in chronological order.
  idx <- idx[order(jc.data$rdate[idx])]
  recent <- tail(jc.data$rank[idx], 6L)
  if (all(is.na(recent))) {
    return(NA_real_)
  }
  round(mean(recent, na.rm = TRUE))
}

  # apply() converts the data frame to a matrix and passes each row as a
  # character vector, so the collected result may be character; coerce it
  # back so the avgl6rank column stays numeric.
  jc.data[["avgl6rank"]] <- as.numeric(apply(jc.data, 1, Avg6RankF))
 structure(list(index = c(64L, 577L, 33704L, 34538L, 35753L, 36119L, 
36641L, 38259L, 38484L, 39060L, 40507L, 41326L, 41814L, 41938L, 
42537L, 43006L, 43073L, 43354L, 44056L, 44186L, 44615L, 44665L, 
45385L, 46060L, 46636L, 47134L, 47526L, 48030L, 48176L, 48799L, 
50485L, 51167L, 51660L, 52006L, 52703L, 53352L, 53806L, 54366L, 
55055L, 56041L, 56496L, 56991L, 57718L, 58303L, 59036L, 59717L, 
60211L, 61142L, 61776L, 62348L, 63042L, 63755L, 64474L, 65063L, 
66355L, 66663L, 67179L, 67415L, 68015L, 68345L, 69616L, 71067L, 
72162L, 74472L, 75555L, 76018L, 76754L, 77463L, 79022L, 79740L, 
81273L, 81885L, 83136L, 83468L, 84202L, 84937L, 85681L, 87446L, 
88375L, 89242L), rdate = structure(c(13765L, 13782L, 15025L, 
15049L, 15089L, 15101L, 15115L, 15228L, 15235L, 15253L, 15298L, 
15322L, 15340L, 15343L, 15360L, 15375L, 15375L, 15385L, 15406L, 
15409L, 15424L, 15424L, 15451L, 15472L, 15494L, 15508L, 15522L, 
15536L, 15591L, 15614L, 15669L, 15690L, 15706L, 15717L, 15738L, 
15760L, 15774L, 15791L, 15815L, 15843L, 15858L, 15872L, 15893L, 
15963L, 15990L, 16012L, 16026L, 16054L, 16071L, 16089L, 16109L, 
16130L, 16152L, 16169L, 16211L, 16222L, 16236L, 16243L, 16257L, 
16334L, 16372L, 16418L, 16453L, 16526L, 16561L, 16575L, 16596L, 
16617L, 16715L, 16740L, 16792L, 16813L, 16852L, 16862L, 16883L, 
16904L, 16928L, 16988L, 17072L, 17100L), class = c("IDate", "Date"
)), rid = c(5L, 1L, 2L, 4L, 3L, 2L, 5L, 1L, 1L, 1L, 3L, 2L, 3L, 
1L, 5L, 1L, 6L, 1L, 1L, 4L, 1L, 5L, 5L, 4L, 2L, 5L, 6L, 6L, 6L, 
5L, 6L, 5L, 7L, 5L, 4L, 6L, 6L, 3L, 4L, 6L, 6L, 4L, 4L, 6L, 6L, 
5L, 6L, 5L, 7L, 5L, 6L, 3L, 4L, 4L, 5L, 2L, 6L, 5L, 9L, 6L, 6L, 
9L, 9L, 3L, 3L, 3L, 4L, 5L, 3L, 6L, 2L, 2L, 4L, 3L, 6L, 9L, 9L, 
7L, 3L, 3L), horsenum = c("D350", "D350", "M133", "M133", "M133", 
"M133", "M133", "M133", "M133", "M133", "M350", "M133", "M350", 
"M133", "M350", "M133", "M350", "M133", "M133", "M350", "M133", 
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350", 
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350", 
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350", 
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350", 
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350", 
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350", 
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350", 
"M350", "M350", "M350"), rank = c(12, 9, 9, 11, 10, 10, 12, 14, 
14, 6, 10, 9, 10, 11, 4, 6, 12, 6, 10, 9, 14, 10, 9, 5, 3, 1, 
1, 10, 11, 8, 10, 9, 4, 1, 3, 7, 2, 7, 7, 4, 1, 5, 3, 13, 7, 
6, 3, 2, 4, 6, 5, 3, 6, 4, 6, 6, 1, 1, 7, 7, 11, 7, 6, 3, 3, 
4, 8, 14, 1, 11, 10, 8, 10, 1, 10, 11, 11, 11, 10, 11), avgl6rank = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 6, 6, 7, 8, 9, 9, 7, 8, 8, 8, 9, 9)), row.names = c(NA, 
-80L), .internal.selfref = <pointer: 0x000001e914901ef0>, class = "data.frame")

【问题讨论】:

  • 查看最新data.table版本中的frollapply、frollsum和frollmean

标签: r dplyr data.table


【解决方案1】:

可以使用最新版的 data.table(1.12.4 版本,2 天前发布),它支持滚动(rolling)函数。

在此处查看 #27:https://github.com/Rdatatable/data.table/blob/master/NEWS.md

library(data.table)
setDT(DT)

# Key by horse, then date, so each horse's races are in chronological order.
setkey(DT, horsenum, rdate)

# For every race, average the ranks of up to 6 races that came before it:
#  - shift(rank) moves the ranks down one row, so the current race is
#    excluded from its own window;
#  - adaptive = TRUE takes one window length per row, so the window grows
#    from 1 to 6 while the horse's history is still short;
#  - the roll runs over all rows of the horse (not only those with a missing
#    avgl6rank), so races that already have a value still count as history.
# Rows that already have avgl6rank stay NA in avgl6rank_2.
# NOTE(review): assumes a horse runs at most once per rdate - confirm.
DT[, avgl6rank_2 := {
  n_prior <- seq_len(.N) - 1L
  win <- pmin(pmax(n_prior, 1L), 6L)
  avg <- frollmean(shift(rank), win, adaptive = TRUE, na.rm = TRUE)
  # No earlier race, or only missing ranks in the window -> NA, not NaN.
  avg[n_prior == 0L | is.nan(avg) | !is.na(avgl6rank)] <- NA_real_
  round(avg)
}, by = .(horsenum)][]

【讨论】:

  • 滚动窗口只支持定长吧?如果我想使用从 1 到 6 的滚动窗口长度怎么办?
  • @tiezhuetc 它可以支持1:6 并且会给你所有6个窗口。输出将是list
  • 当我计算平均6个等级时,我不想将当前等级包含在6个等级中,frollmean可以这样做吗?
  • 那么您可能需要一个辅助列,将排名列移动 1,然后在该列上执行 frollmean...查看 data.table::shift()
  • 如果你需要一个可变长度的窗口大小,那么使用adaptive=TRUE,阅读手册,有一个例子可以涵盖这一点。 frollmean 从 1.12.2 开始。
【解决方案2】:

使用 dplyr 和 purrr::map2_dbl,你可以试试:

library(dplyr)

# Fill missing avgl6rank with the rounded mean rank of up to 6 earlier races
# of the same horse. The window ends at the previous row (i - 1), so the
# current race is never part of its own average.
df %>%
  arrange(rdate) %>%
  group_by(horsenum) %>%
  mutate(avgl = purrr::map2_dbl(
    row_number(), avgl6rank,
    function(i, current) {
      if (!is.na(current)) {
        return(current)
      }
      if (i == 1L) {
        # First race of the horse: there is no history to average.
        return(NA_real_)
      }
      prior <- rank[max(1L, i - 6L):(i - 1L)]
      if (all(is.na(prior))) NA_real_ else round(mean(prior, na.rm = TRUE))
    }
  )) %>%
  ungroup()

# First rows for the poster's data (avgl excludes the current race):
#   index rdate        rid horsenum jname     tname    pricemoney  rank avgl6rank  avgl
#   <int> <date>     <int> <chr>    <chr>     <chr>    <chr>      <dbl>     <dbl> <dbl>
# 1    64 2007-09-09     5 D350     S Dye     C Fownes NA            12        NA    NA
# 2   577 2007-09-26     1 D350     W M Lai   C Fownes NA             9        NA    12
# 3 40507 2011-11-20     3 M350     D Beadman J Moore  NA            10        NA    NA
# 4 41814 2012-01-01     3 M350     D Beadman J Moore  NA            10        NA    10
# 5 42537 2012-01-21     5 M350     D Beadman J Moore  NA             4        NA    10
# 6 43073 2012-02-05     6 M350     T Clark   J Moore  NA            12        NA     8
# 7 44186 2012-03-10     4 M350     N Callan  J Moore  NA             9        NA     9
# 8 44665 2012-03-25     5 M350     T Clark   J Moore  NA            10        NA     9
# 9 45385 2012-04-21     5 M350     J Lloyd   J Moore  NA             9        NA     9
#10 46060 2012-05-12     4 M350     Y T Cheng J Moore  NA             5        NA     9
# … with 56 more rows

【讨论】:

    猜你喜欢
    • 2021-06-09
    • 2012-04-25
    • 2018-11-05
    • 1970-01-01
    • 2020-09-02
    • 2022-01-14
    • 2012-11-23
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多