使用 r 中的 data.table 计算加权平均值，不包括每行的某些行答案

【问题标题】：Calculating weighted averages excluding some rows for each row using data.table in r使用 r 中的 data.table 计算加权平均值，不包括每行的某些行
【发布时间】：2020-03-07 18:19:39
【问题描述】：

我一直在尝试使用 data.table 获得一列加权平均值，其中排除了每行的一些行。

在以下示例中，FIPS 是 ID 变量，STATE 是组变量。我想计算不包括同一州的相邻县的加权平均值。以及在其他州。

我知道如何实现它，但我想有一种更有效的方法。我不熟悉使用 data.table 的按行操作。有任何想法吗？先感谢您。

library(data.table)
rm(list=ls())

set.seed(920410)
DT <- data.table(FIPS =1:21, STATE = LETTERS[1:2], value=1:3, weight=2:7); DT
DT[, nbs := list(list(sample(1:21, 3))), by= names(DT)]

for(i in 1:nrow(DT)){
 DT$neighbor_sum_in_the_same_state[i] <- sum(DT[FIPS %in% unlist(DT$nbs[i]) & STATE == DT$STATE[i], value*weight])
 DT$neighbor_sum_in_other_states[i] <- sum(DT[FIPS %in% unlist(DT$nbs[i]) & STATE != DT$STATE[i], value*weight])
 }

【问题讨论】：

标签： r loops data.table weighted-average rowwise

【解决方案1】：

也许有等价的data.table，这是tidyverse中的一种方式

library(dplyr)
library(purrr)

DT %>%
  group_by(STATE) %>%
  mutate(val1 = map_dbl(nbs, ~{inds <- FIPS %in% .x; 
                               sum(value[inds] * weight[inds])})) %>%
  ungroup() %>%
  mutate(val2 = map2_dbl(nbs, STATE, ~{inds <- FIPS %in% .x & STATE != .y; 
                                        sum(value[inds] * weight[inds])}))

# A tibble: 21 x 9
#    FIPS STATE value weight nbs       OP_val1 OP_val2  val1  val2
#   <int> <chr> <int>  <int> <list>      <int>   <int> <dbl> <dbl>
# 1     1 A         1      2 <int [3]>      14      21    14    21
# 2     2 B         2      3 <int [3]>      11      12    11    12
# 3     3 A         3      4 <int [3]>       0      17     0    17
# 4     4 B         1      5 <int [3]>       5      14     5    14
# 5     5 A         2      6 <int [3]>      16       0    16     0
# 6     6 B         3      7 <int [3]>      26      12    26    12
# 7     7 A         1      2 <int [3]>      14       5    14     5
# 8     8 B         2      3 <int [3]>      27       2    27     2
# 9     9 A         3      4 <int [3]>       2      42     2    42
#10    10 B         1      5 <int [3]>       6      14     6    14
# … with 11 more rows

其中OP_val1 和OP_val2 是在OP 中运行for 循环后的输出。

数据

set.seed(920410)
DT <- data.table(FIPS =1:21, STATE = LETTERS[1:2], value=1:3, weight=2:7)
DT[, nbs := list(list(sample(1:21, 3))), by= names(DT)]

for(i in 1:nrow(DT)){
   DT$OP_val1[i] <- sum(DT[FIPS %in% unlist(DT$nbs[i]) & STATE == DT$STATE[i], value*weight])
   DT$OP_val2[i] <- sum(DT[FIPS %in% unlist(DT$nbs[i]) & STATE != DT$STATE[i], value*weight])
}

【讨论】：

【解决方案2】：

这是data.table 中的一个选项，通过在执行连接之前将其转换为长格式：

#convert into long format i.e. unlist the nbs column
nm <- c("FIPS","STATE","value", "weight")
DT_long <- DT[, .(nbs=unlist(nbs)), nm]

#look for neighbours in same STATE and FIPS
DT_long[, neighbor_sum_in_the_same_state := 
    .SD[.SD, on=.(FIPS=nbs, STATE), sum(x.value[1L] * x.weight[1L]), by=.EACHI]$V1]

#look for all in same FIPS but exclude those with same STATE
DT_long[, neighbor_sum_in_other_states := 
        .SD[.SD, on=.(FIPS=nbs), sum(x.value[x.STATE!=i.STATE][1L] * x.weight[x.STATE!=i.STATE][1L]), by=.EACHI]$V1]

#produce desired output
DT_long[, lapply(.SD, sum, na.rm=TRUE), nm, 
    .SDcols=c("neighbor_sum_in_the_same_state", "neighbor_sum_in_other_states")]

输出：

    FIPS STATE value weight neighbor_sum_in_the_same_state neighbor_sum_in_other_states
 1:    1     A     1      2                             14                           21
 2:    2     B     2      3                             11                           12
 3:    3     A     3      4                              0                           17
 4:    4     B     1      5                              5                           14
 5:    5     A     2      6                             16                            0
 6:    6     B     3      7                             26                           12
 7:    7     A     1      2                             14                            5
 8:    8     B     2      3                             27                            2
 9:    9     A     3      4                              2                           42
10:   10     B     1      5                              6                           14
11:   11     A     2      6                             12                           26
12:   12     B     3      7                             11                            2
13:   13     A     1      2                             12                           11
14:   14     B     2      3                              5                           24
15:   15     A     3      4                             12                           26
16:   16     B     1      5                             21                           24
17:   17     A     2      6                              4                            5
18:   18     B     3      7                              6                           14
19:   19     A     1      2                             14                            5
20:   20     B     2      3                             11                           12
21:   21     A     3      4                             12                           27
    FIPS STATE value weight neighbor_sum_in_the_same_state neighbor_sum_in_other_states

【讨论】：

【解决方案3】：

谢谢你们俩 :) 那些帮助！

在尝试了各种方法后，我编写了以下代码。以下代码在不使用循环的情况下计算不包括同一州以及其他州的相邻县的值的加权平均值。


DT[, weighted_avg_nonneighboring_counties_in_same_state := 
  weighted.mean(
  DT[!FIPS == .BY[1] & !FIPS %in% unlist(nbs[.I]) & STATE == .BY[2], value],
  DT[!FIPS == .BY[1] & !FIPS %in% unlist(nbs[.I]) & STATE == .BY[2], weight],
  na.rm=TRUE),
  by=.(FIPS,STATE)][,
     weighted_avg_nonneighboring_counties_in_other_states := 
  weighted.mean(
  DT[!FIPS == .BY[1] & !FIPS %in% unlist(nbs[.I]) & STATE != .BY[2], value],
  DT[!FIPS == .BY[1] & !FIPS %in% unlist(nbs[.I]) & STATE != .BY[2], weight], 
  na.rm=TRUE),  
  by=.(FIPS,STATE)]

【讨论】：