如何计算更快的月度加权平均值答案

【问题标题】：How to calculate a faster days-in-month weighted average如何计算更快的月度加权平均值
【发布时间】：2020-03-12 16:50:30
【问题描述】：

电能表通常不在月初和月底开始和结束，而是与日历不均匀地重叠。我正在尝试使用加权平均逻辑来排列这些读取日期并计算单个月份的值。我附上了我的代码示例，它构建了一个与我正在使用的数据集相似的数据集。每行是一个单独的能量计。每 3 列代表一个开始日期和结束日期，以及该时间段使用的能量值。

我一直在处理数十万行，这个过程需要 20 多分钟。我很想能够使用data.table，但我对它太陌生了，鉴于数据的列结构，我不确定如何让seq.Date 工作。

# Making the Fake Dataset
set.seed(123)
fake_rows = 10
{
  testdata <- replicate(fake_rows, {
    start_it <- as.Date('2019/01/01') + sample(-20:20, 1, T)
    track <- start <- end <- value <- c()
    for(i in 1:12){
      a <- seq.Date(start_it, length.out = sample(28:34,1), by="day")
      start[i] <- a[1]
      end[i] <- start_it <- a[length(a)]
      value[i] <- sample(1:200,1)
      track <- c(track, start[i], end[i], value[i])
    }
    return(track)
  })

  testdata <- as.data.frame(t(testdata)) 

  month_labels <- c(paste0("0",1:9), "10","11","12")
  start_dates <- sapply(month_labels, function(x) paste0("Start_Date_",x))
  end_dates <- sapply(month_labels, function(x) paste0("End_Date_",x))
  values <- sapply(month_labels, function(x) paste0("Value_",x))

  colnames(testdata) <- c(rbind(start_dates,end_dates,values))

  # replace columns with the dates
  for(i in c(start_dates, end_dates)){
    testdata[,i] <- as.Date(testdata[,i], origin = "1970-01-01")
  }

  testdata[2, 7:36] <- NA # some are missing dates and values
}

testdata

#   Start_Date_01 End_Date_01 Value_01 Start_Date_02 End_Date_02 Value_02
#1     2019-01-11  2019-02-13      179    2019-02-13  2019-03-17      195
#2     2018-12-20  2019-01-21      164    2019-01-21  2019-02-22       81
#3     2019-01-05  2019-02-02       69    2019-02-02  2019-03-04       63
#4     2018-12-28  2019-01-29       50    2019-01-29  2019-02-25       34
#5     2019-01-15  2019-02-16      199    2019-02-16  2019-03-17      151
#6     2019-01-15  2019-02-16       94    2019-02-16  2019-03-21       24
#7     2019-01-05  2019-02-07       54    2019-02-07  2019-03-07      137
#8     2019-01-16  2019-02-15      108    2019-02-15  2019-03-19      177
#9     2018-12-25  2019-01-25       16    2019-01-25  2019-02-27      125
#10    2019-01-09  2019-02-07       10    2019-02-07  2019-03-10       54

我采用了下面的 data.frame 方法：

library(data.table)
# for each row, determine what monthly values would be
output <- matrix(NA, nrow = nrow(testdata), ncol = 12)
month_cols <- as.character(1:12)

for(i in 1:nrow(testdata)){
  x <- y <- vector("list", 12)

  for(j in 1:12){
    if(!is.na(testdata[i, start_dates[j]])){
      # get the counts of days in each month within the meter read period
      x[[j]] <- table(month(seq.Date(testdata[i, start_dates[j]], testdata[i, end_dates[j]], "day")))
      # multiply the meter read value by days in each month (the numerator of a day wtd avg)
      y[[j]] <- testdata[i, values[j]] * x[[j]]      
    }
    months <- names(unlist(y))
    # day weighted average  = Σ(value x Days) / Σ(Days)
    final <- tapply(unlist(y), months, sum) / tapply(unlist(x), months, sum) 
    output[i,] <- final[match(month_cols, names(final))] # ordered in the case of missing months
  }
}

output

其中行是原始数据集的行，列表示从 1 月到 2 月的估计值，没有附加特定年份，因为我对跨月的所有值进行日加权，而不考虑年份。

#         [,1]      [,2]      [,3]      [,4]      [,5]      [,6]     [,7]      [,8]       [,9]    [,10]     [,11]     [,12]
# [1,] 140.77778 187.82759 127.03125  46.16129  28.50000  81.25806 125.8750  91.00000  91.516129 120.1250 108.80645  32.87500
# [2,] 135.46875  81.00000        NA        NA        NA        NA       NA        NA         NA       NA        NA 164.00000
# [3,]  80.61290  63.41379  92.75000  91.77419  39.96970  45.74194  87.6875  20.87500 100.838710 196.4375  86.00000 154.43750
# [4,]  48.50000  31.10345  30.81250 130.35484 128.43750  48.70968 117.8125  27.81250  55.322581 137.0312 123.38710 145.65714
# [5,] 142.03571 177.48276 137.40625 106.48387 102.53125 116.00000  86.0000 102.25000 112.032258 153.4375 183.29032  96.50000
# [6,]  88.34286  62.62069  52.53125 126.87097 132.62500 128.19355 157.9688 103.43750   9.612903  30.6250  93.67742 131.09375
# [7,]  62.91429 116.96552  67.46875  72.83871 102.25000 171.32258 178.5000 112.50000  38.645161 131.0000 127.22581  96.43750
# [8,]  86.08696 141.31034 129.06250  35.77419  97.00000 122.93548 146.3125 128.18750 151.161290 199.1250 172.90323  74.75000
# [9,]  39.84375 119.13793  70.00000 180.64516  85.12500  49.64516 116.5000  92.28125 117.225806  46.1250  27.35484  29.16129
#[10,]  37.77143  43.37931  90.43750  51.45161  25.71875 120.22581 111.6562 126.81250 123.193548  46.0625  84.74194  97.53125

如何提高性能？

【问题讨论】：

只是为了确保我理解，如果日期范围是 2019-01-11 2019-02-13，您想将其计算为 1 月的 21 天和 2 月的 13 天。因此，179 的值被划分为 21/34 * 179 分配给一月和 13/34 * 179 分配给二月？
@eipi10 是的，完全正确。我写它的方式是使用另一组列中新二月天的权重，包括不同年份的二月。但数学如你所说
示例解决方案对我不起作用； month 函数来自什么包，润滑？ month_cols 在哪里定义？你能分享一下output 的样子吗？
抱歉，现在修复。 month 似乎是我正在使用的 data.table 函数。
我的代码也可能错误地重复计算天数...sum(testdata[1,values]) 和 sum(output[1,]) 应该几乎相同，假设 12 次读取不超过 365 天的数据...

标签： r data.table

【解决方案1】：

这里是data.table + lubridate 方法。

我的输出与您想要的输出不同。但我不确定哪一个是正确的;-)

library( data.table )
library(lubridate)
#make data.table
setDT( testdata )
#insert row_id
testdata[, row_id := .I ]
#melt
dt <- melt( testdata, 
            id.vars = "row_id",
            measure.vars = patterns( 
              Start_Date = "^Start", 
              End_Date = "^End", 
              Value = "^Value" ) )
#drop the meaningless variable
dt[, variable := NULL ]
#Calculate daily value
dt[, value_day := Value / as.numeric( difftime( End_Date,  Start_Date, units = "days") ) ]
#create a table per day over the entire period
dt.days <- data.table( date = seq( min( dt$Start_Date, na.rm = TRUE ), 
                                   max( dt$End_Date, na.rm = TRUE ), 
                                   by = "1 days" ) )
#left join
answer <- dt[ dt.days, on = .(Start_Date <= date, End_Date >= date ), mult = "all", allow.cartesian = TRUE ]
#and summarise by monthly period
dcast(
  answer[, 
         .(month.total = sum( value_day ) ), 
         by = .(row_id, month = sprintf( "%02d", lubridate::month( Start_Date ) ) ) ],
  row_id ~ month )

输出

#    row_id        01        02        03        04        05        06        07        08         09        10        11        12
# 1:      1 115.40909 168.01515 130.37946  47.84375  28.72581  78.94456 131.56250  98.65323  98.550777 142.37037 114.04421  34.49194
# 2:      2 135.46875  55.68750        NA        NA        NA        NA        NA        NA         NA        NA        NA  61.50000
# 3:      3  85.80844  61.62857  96.01290 103.51613  48.06810  45.21408  85.07879  22.26667 103.366667 196.43750  80.62500 149.92045
# 4:      4  49.09028  33.21481  32.18387 131.71635 141.57241  53.88889 138.18287  27.51420  52.765152 136.45833 116.20833 159.31250
# 5:      5 124.28125 167.18966 145.70474 102.10985 102.34897 117.64627  96.46305 113.35714 120.302381 167.90000 202.06667 107.49537
# 6:      6  99.73750  56.45455  58.86532 131.43098 135.22944 131.92857 156.06061 100.60065   9.714286  31.29032  97.69077 143.41494
# 7:      7  69.83699 119.09740  70.61364  71.99413 108.17419 163.96667 195.71717 120.27778  38.170833 131.30000 127.80000 105.21839
# 8:      8  66.60000 131.43750 132.48661  39.21429 114.31111 131.80208 149.83266 135.40601 149.424569 219.95833 186.07407  81.61905
# 9:      9  39.41838 105.23569  81.37566 200.00000  96.94355  47.00587 115.61039 101.48333 119.333333  44.72727  26.52456  30.92325
# 10:     10  42.05603  40.73637  93.35484  52.61958  27.69195 113.56970 108.27273 131.72121 134.688889  52.06452  82.30242  97.53125

【讨论】：

感谢 data.table 方法 - 需要熟悉这些函数 pattern 和 "^"。我已经用输出上方的一些文本更新了我的问题，描述了我正在尝试做的事情。让我知道这是否可以澄清。
^ 用于正则表达式。请查看?data.table::melt 以了解patterns() 在此处的使用 *向下滚动）。是否有必要将数据保留为您描述的输出格式？
我将答案更新为类似于您想要的输出的内容。但是值不同。也许您可以找出原因..
我今天要测试一下——我猜这可能是因为（如果我没看错的话）加入是完成的，好像结束日期包括在内（就像开始日期一样）当我将它们视为排他性的。每个人的结果都与我不同，所以开始认为我需要重新检查我的数学 lol
如果要排除 End_Date，请在上面的左连接中使用 End_Date > date..

【解决方案2】：

我希望将数据重新整形为长格式，然后执行一些矢量化计算，然后将数据重新整形回最终形式会更快，即使重新整形步骤会增加一点开销。

这是一种tidyverse 方法，使用fuzzyjoin 在每个期间添加月末休息时间（如果适用）。这种方式应该可以灵活地适应跨越 1、2、3 个月或更长时间的时期。

首先，这里将数据重新整形为长格式。

library(tidyverse)
library(lubridate)

# Helper function to gather all columns whose names start with "header"
extract_cols <- function(header, col_name) {
  testdata %>%
  select(starts_with(header)) %>%
  mutate(meter_num = row_number()) %>%
  pivot_longer(-meter_num, 
               names_to = "period", names_pattern = paste0(header, "(.*)"), 
               values_to = col_name)
}

# Using the helper function, make one long table, with a row for each meter-period.
testdate_long <- extract_cols("Start_Date_", "Start") %>%
                   left_join(extract_cols("End_Date_", "End")) %>%
                   left_join(extract_cols("Value_", "Val")) %>%
                   mutate(ttl_days = (End-Start)/ddays(1))

现在看起来像这样：

> testdate_long
# A tibble: 120 x 6
   meter_num period Start      End          Val ttl_days
       <int> <chr>  <date>     <date>     <int>    <dbl>
 1         1 01     2018-12-23 2019-01-24    82       32
 2         1 02     2019-01-24 2019-02-26   189       33
 3         1 03     2019-02-26 2019-03-25   106       27
 4         1 04     2019-03-25 2019-04-27   111       33
 5         1 05     2019-04-27 2019-05-27   192       30
 6         1 06     2019-05-27 2019-06-26   136       30
 7         1 07     2019-06-26 2019-07-27    21       31
 8         1 08     2019-07-27 2019-08-29    50       33
 9         1 09     2019-08-29 2019-09-25    66       27
10         1 10     2019-09-25 2019-10-28   178       33
# … with 110 more rows

现在，记录每个时期内的所有月末会很有用。大多数时候会有一个，但有时会没有（End 与 Start 是同一个月），有时会有两个或更多（例如 1 月 31 日至 3 月 2 日，或超过 1 个月的时间段） )。一种方法是使用 fuzzyjoin 来添加介于 Start 和 End 之间的所有月末。

# Make a list of all the month end dates that might come into play
month_ends <- data.frame(
  month_ends = seq.Date(
    ceiling_date(min(testdate_long$Start, na.rm = T), "month"),
    ceiling_date(max(testdate_long$End, na.rm = T), "month"), by = "months") - 1)

# Add all the month-ends and calculate the length of all the date ranges, from Start to (if applicable) month-end to End:

testdate_full <- testdate_long %>%
  mutate(ttl_days = (End-Start)/ddays(1)) %>%
  fuzzyjoin::fuzzy_left_join(month_ends,
                             by = c("Start" = "month_ends",
                                    "End"   = "month_ends"),  
                             match_fun = list(`<`, `>`)) %>%
  pivot_longer(c(Start, End, month_ends),
               names_to = "type", values_to = "date") %>%
  arrange(meter_num, period, date) %>% 
  # Edited below to remove "group_by"
  mutate(mo = month(date),
         days = if_else(meter_num == lag(meter_num) & period == lag(period),
                    (date - lag(date))/ddays(1),
                    NA_real_)) %>% 
  filter(!is.na(date), !is.na(days))

然后很容易获得加权值并将它们传播到所需的输出格式。

testdate_full %>%
  mutate(val_wtd = Val * days / ttl_days) %>%
  count(meter_num, mo, wt = val_wtd, name = "val_wtd") %>%
  spread(mo, val_wtd)

（我发现这会产生与您的输出不同的值，我会回来检查是否遗漏了什么。无论如何，我希望这能让您更接近一个性能更高的解决方案。）这里有一点检查。从testdata 的第一行开始，似乎有两个时期部分发生在一月。

> testdata[1, c(1:6)]
  Start_Date_01 End_Date_01 Value_01 Start_Date_02 End_Date_02 Value_02
1    2018-12-23  2019-01-24       82    2019-01-24  2019-02-26      189

第一个时期在 1 月份的 32 天中有 24 天 (75%)。 0.75 x 82 = 61.5

第二个周期在 1 月的 33 天中有 7 天 (21.21%)。 0.21 x 189 = 40.1

所以我希望在这里看到 101.6，这与此处产生的结果相对应。

【讨论】：

感谢这种 tidyverse 方法 - 长文件现在变成了 +470 万行 - 我认为我的计算机可能已经崩溃，但会在我的时间检查完成时通知您。现在它不是一个循环的方法，更难看出它是在滴答作响还是我毁了 R
是的，不幸的是我的电脑在应用这个逻辑时崩溃了。一种方法可能是让我将我的数据分成更多的小块（约 50k 行），然后运行......
我认为 group_at 步骤是一个瓶颈。现在通过删除它，它可以处理更大的数据（例如，你的假数据最多 20k 行，我能得到的最多）。

【解决方案3】：

这种方式非常快，但假设了一些关于数据的事情：

j 列中的每条记录大约是同一个一般月份。以一月份为例。
j 中的月份范围是在一般月份之前或之后的月份。对于 1 月，这意味着将考虑 12 月至 2 月。

这将data.table 用于year() 和month() 函数。

library(data.table)
# vectors identifying the column positions 
starts <- grep('Start', names(testdata))
ends <- grep('End', names(testdata))
values <- grep('Value', names(testdata))

# calculating how many days are in each period and the value per day
days_per_period <- do.call(cbind, (lapply(testdata[ends] - testdata[starts] + 1, as.numeric)))
val_per_day <- as.matrix(testdata[values]) / days_per_period

# seq the dates based on the minimum and maximum 
min_date <- as.Date(min(unlist(testdata[starts], use.names = F), na.rm = T),  origin ="1970-01-01")
max_date <- as.Date(max(unlist(testdata[ends], use.names = F), na.rm = T),  origin ="1970-01-01")

start_months <- seq.Date(as.Date(paste(year(min_date), month(min_date), '1', sep = '/')),
                         as.Date(paste(year(max_date), month(max_date), '1', sep = '/')),
                         by = 'month')

# calculates the days for each month
## WARNING
## Assumes that for any column j that the only possible months would be
## the months prior and after the "current" month
lag_mon <- mapply(function(met_start, month_start) pmax(month_start - met_start - 1 , 0), testdata[starts], start_months[2:13])
lead_mon <- mapply(function(met_end, month_end) pmax(met_end - (month_end-1) , 0), testdata[ends], start_months[3:14])
cur_mon <- days_per_period - lag_mon - lead_mon 

lag_mon_val <- cbind(lag_mon * val_per_day, 0, 0)
cur_mon_val <- cbind(0, cur_mon * val_per_day, 0)
lead_mon_val <- cbind(0, 0, lead_mon * val_per_day)

# makes an array to deal with NAs. Otherwise, this would be
# simply lag_mon_val + cur_mon_val + lead_mon_val
arrs <- array(c(lag_mon_val, cur_mon_val, lead_mon_val),
              dim = c(dim(lag_mon_val)[1], dim(lag_mon_val)[2], 3),
              dimnames = list(NULL, as.character(start_months), c('lag', 'cur', 'lead')))

apply(arrs, 2, rowSums, na.rm = T)

结果：（7 列被截断）：

     2018-12-01 2019-01-01 2019-02-01 2019-03-01 2019-04-01 2019-05-01 2019-06-01
 [1,]   0.000000 110.558824  162.98663  126.31661   46.28945   27.84848   76.54545
 [2,]  54.666667 133.878788   56.45455    0.00000    0.00000    0.00000    0.00000
 [3,]   0.000000  64.241379   59.62959   93.00403   99.87500   45.09375   42.45037
 [4,]   4.545455  47.883117   32.82949   27.61694  126.42500  141.24839   47.02304
 [5,]   0.000000 102.515152  161.91818  141.02121   99.04545   99.25000  113.91667
 [6,]   0.000000  48.424242   54.75223   56.85924  127.14076  131.03043  127.67189
 [7,]   0.000000  42.882353  115.04868   68.36308   69.76838  104.71169  159.04934
 [8,]   0.000000  55.741935  127.34897  128.35737   37.87430  110.24885  127.36797
 [9,]   3.000000  35.058824  104.36975   73.05419  195.68966   97.64009   41.92279
[10,]   0.000000   7.666667   39.45833   90.43750   50.93750   26.79032  110.20968

【讨论】：

感谢您提供答案 - 今天将查看 jt。我会说 j 列实际上是反向顺序的（2019 年 9 月到 2018 年 8 月），但并非所有行都从同一个月开始。很快就会看看这个
第二部分的逻辑是真的吗？即月份范围始终在连续3个月以内？
任何 j 的范围是动态的，大约跨越 33 天 - 尚未测试此代码
谢谢。而且我并不是想不耐烦——只是想弄清楚如何尝试修改

【解决方案4】：

这是另一种使用“IRanges”包以（此处为日期）间隔执行计算的方法。

首先，为了方便，将数据分开：

col_starts = testdata[grep("^Start_Date", names(testdata))]
col_ends = testdata[grep("^End_Date", names(testdata))]
col_values = testdata[grep("^Value", names(testdata))]

这里很明显，“testdata”包含定义为 [col_starts, col_ends] 的每条记录（行）的 length(col_starts) (= 12) 日期间隔。接下来，我们需要找到跨越“testdata”时间段的日历月的间隔：

# Find starting month
start_date = as.POSIXlt(do.call(min, c(na.rm = TRUE, col_starts)))
start_date$mday = 1L
start_date = as.Date(start_date)

# Find ending month (+ 1)
end_date = as.POSIXlt(do.call(max, c(na.rm = TRUE, col_ends)))
end_date$mon = end_date$mon + 1L
end_date$mday = 1L
end_date = as.Date(end_date)

# Create a series of months between first and last
month_series = seq(start_date, end_date, by = "1 month")

# create the intervals of existing months
month_starts = head(month_series, -1)
month_ends = tail(month_series, -1) - 1

现在我们有 [month_starts, month_ends] 间隔，它们与“testdata”中定义的间隔重叠。这些区间定义为“IRanges”对象：

library(IRanges)
month_ints = IRanges(as.numeric(month_starts), as.numeric(month_ends))

现在我们需要遍历 (length(col_starts)) 的“testdata”连续间隔，并根据它们与“month_ints”的重叠来传播它们的值。首先分配将保存每个月的值的输出，这些值将在通过所有间隔时更新：

output = matrix(0, nrow = nrow(col_starts), ncol = length(month_series), 
                dimnames = list(c(), as.character(month_series)))

现在遍历“testdata”的所有区间：

for(j in 1:ncol(col_starts)) {

    # prepare (remove "date" attribute and handle NAs) for IRanges
    s = as.numeric(col_starts[, j])
    e = as.numeric(col_ends[, j] - 1)  # '- 1' to exclude ending day
    hasNA = !complete.cases(s, e)
    s[hasNA] = 0L
    e[hasNA] = 0L

    # an IRanges object containing current period intervals for all rows
    current_ints = IRanges(s, e)

    # average the interval's value across its days
    value_by_day = col_values[, j] / width(current_ints)

    # count amount of overlap between current (j) intervals and month intervals
    overlaps = findOverlaps(current_ints, month_ints)
    overlaps_width = width(pintersect(current_ints[from(overlaps)], month_ints[to(overlaps)]))

    # update "output"
    index = cbind(from(overlaps), to(overlaps))
    output[index] = output[index] + (value_by_day[from(overlaps)] * overlaps_width)
}

“输出”保存从“testdata”开始到结束日期每月的值。无论年份如何，都可以有效地每月汇总价值：

which_month = as.numeric(format(as.Date(colnames(output)), "%m"))
monthly_values = matrix(0L, nrow = nrow(output), ncol = max(which_month), 
                        dimnames = list(NULL, 1:max(which_month)))
for(j in 1:ncol(output)) 
    monthly_values[, which_month[j]] =  monthly_values[, which_month[j]] + output[, j]

最终输出：

#> round(monthly_values, 1)
#          1     2     3     4     5     6     7     8     9    10    11    12
# [1,] 115.2 162.6 124.3  46.1  27.4  78.5 126.8  95.4  95.6 139.0 109.0  31.3
# [2,] 130.3  53.2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  61.5
# [3,]  81.0  59.2  93.9 100.4  43.3  43.9  82.4  21.7 100.2 190.2  80.6 147.2
# [4,]  47.5  32.0  31.8 127.6 136.3  53.2 132.3  26.7  51.7 132.1 112.5 154.4
# [5,] 122.6 161.0 140.5  98.3  99.8 113.3  93.9 109.6 117.0 163.4 196.3 100.5
# [6,]  97.0  53.5  58.1 127.5 130.7 128.3 151.1  95.9   9.5  30.8  95.9 137.8
# [7,]  66.6 117.5  65.7  70.5 105.5 160.3 189.8 114.1  35.2 130.8 122.3 101.8
# [8,]  65.3 127.8 127.0  37.1 113.9 125.0 147.2 129.1 146.8 213.8 178.7  76.3
# [9,]  38.9 101.4  79.9 192.9  92.5  46.7 111.5  99.3 114.3  43.1  25.7  29.8
#[10,]  39.2  40.4  91.6  49.2  26.8 112.7 103.6 129.0 129.6  48.2  81.6  94.1

还有一张支票：

#> all(rowSums(monthly_values) == rowSums(col_values, na.rm = TRUE))
#[1] TRUE

【讨论】：