【问题标题】:Use dataframe as filtering criteria in R(在 R 中使用数据框作为过滤条件)
【发布时间】:2021-09-18 19:01:09
【问题描述】:

我正在编写一个函数,它使用数据帧作为包含模型输出的大数据帧的过滤标准。这些是过滤条件(作为 df):

      parameter value
1     alpha   0.1
2      beta   0.1
3       eta   0.1
4      zeta   0.1
5    lambda   0.5
6       phi   5.0
7     kappa   1.0

输入(过滤条件数据框 `values` 的 dput 结果):

    structure(list(parameter = structure(c(1L, 2L, 3L, 7L, 5L, 6L, 
4L), .Label = c("alpha", "beta", "eta", "kappa", "lambda", "phi", 
"zeta"), class = "factor"), value = c(0.1, 0.1, 0.1, 0.1, 0.5, 
5, 1)), class = "data.frame", row.names = c(NA, -7L))

这就是“输出”df 的样子:

     time        w        x         y         z alpha beta eta zeta lambda phi kappa
1    0.0 10.00000 10.00000 10.000000 10.000000   0.1  0.1 0.1  0.1   0.95   5     1
1.1  0.1 10.00572 11.04680  9.896057  9.054394   0.1  0.1 0.1  0.1   0.95   5     1
1.2  0.2 10.01983 12.17827  9.592536  8.215338   0.1  0.1 0.1  0.1   0.95   5     1
1.3  0.3 10.04010 13.37290  9.112223  7.483799   0.1  0.1 0.1  0.1   0.95   5     1
1.4  0.4 10.06377 14.60353  8.489174  6.855626   0.1  0.1 0.1  0.1   0.95   5     1
1.5  0.5 10.08778 15.83982  7.764470  6.323152   0.1  0.1 0.1  0.1   0.95   5     1

输入(模型输出数据框 `outputs` 的 dput 结果):

    structure(list(time = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 276.5, 276.6, 
276.7, 276.8, 276.9, 276.961144437566), w = c(10, 10.0057192322758, 
10.0198266325956, 10.040096099625, 10.0637654242843, 10.087779652849, 
-1.71585943177118, -2.04004317987084, -2.56315700921588, -3.56775247519687, 
-6.37643561014456, -13.828470036737), x = c(10, 11.0467963604334, 
12.1782709261765, 13.3728962503142, 14.6035317074526, 15.8398164069251, 
27.2774474452024, 26.3099862348669, 24.8705756934881, 22.3379071188018, 
15.8960461541267, 3.62452931346518e-144), y = c(10, 9.89605687874935, 
9.59253574727296, 9.11222320249057, 8.48917353431654, 7.76447036695841, 
-0.604572230605542, -0.878231815857628, -1.46586965791714, -3.20623046085508, 
-14.9365932475767, -3.30552834129368e+146), z = c(10, 9.05439359565339, 
8.21533762023494, 7.48379901688836, 6.85562632179817, 6.3231517466183, 
42.3149654949179, 43.8836626616462, 46.4372543252026, 51.7183454733949, 
72.7027555440752, 3.30552834129368e+146), alpha = c(0.1, 0.1, 
0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), beta = c(0.1, 
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), eta = c(0.1, 
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), zeta = c(0.1, 
0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9), lambda = c(0.9, 
0.9, 0.5, 0.5, 0.9, 0.9, 0.5, 0.9, 0.5, 0.9, 0.5, 0.5
), phi = c(5, 5, 5, 5, 5, 5, 20, 20, 20, 20, 20, 20), kappa = c(1, 
1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 10), ode_outputs..iteration.. = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1", 
"1.1", "1.2", "1.3", "1.4", "1.5", "2916.2765", "2916.2766", 
"2916.2767", "2916.2768", "2916.2769", "2916.2770"), class = "data.frame")

所以应该是这样的:

filtered_outputs <- outputs %>% filter(all rows in column 1 == all values in column 2)

“参数”列下的名称对应于“输出”df 中的列名称。我希望这不是硬编码的,这样我就可以将任何过滤标准作为 df 输入,并且该函数将过滤“输出”。我想最好使用 dplyr 或 baseR。

【问题讨论】:

  • 根据您编写的 df 标准,您能否添加一个输出示例和一个期望的输出示例?
  • @s__ 现已添加
  • 请使用dput(df)将数据框粘贴到问题中,以便于复制和测试解决方案。谢谢。
  • 太棒了;也许过滤条件数据框也一样?样本输出数据框是否包含足够的值变化来测试过滤标准;从 alpha 到 kappa,输出数据是相同的。
  • 知道了。那么你想保留outputs 中与values 中的值完全匹配的行吗?

标签: r dataframe filter dplyr


【解决方案1】:

当数据为长格式时,我经常发现这类事情更容易 - 尽管这只是偏好:

# Keep only the rows of `outputs` whose parameter columns all equal the
# values listed in `filter_df` (columns: `parameter`, `value`).
outputs %>%
    # Tag every row with its own id so the all() test below is evaluated per
    # original row. Grouping by `time` is only correct when `time` is unique;
    # rows that share a time value would otherwise be judged together.
    dplyr::mutate(.row_id = dplyr::row_number()) %>%
    tidyr::pivot_longer(
        # Pivot exactly the columns named by the criteria instead of
        # hard-coding which columns to exclude; as.character() also covers a
        # factor `parameter` column.
        cols = dplyr::all_of(as.character(filter_df$parameter)),
        names_to = "parameter",
        values_to = "value_truth"
    ) %>%
    # Explicit key: attach the target `value` for each parameter.
    dplyr::left_join(filter_df, by = "parameter") %>%
    dplyr::group_by(.row_id) %>%
    dplyr::filter(all(value == value_truth)) %>%
    # Drop the grouping so the result is a plain (ungrouped) tibble.
    dplyr::ungroup() %>%
    dplyr::select(-value) %>%
    tidyr::pivot_wider(
        names_from = "parameter",
        values_from = "value_truth"
    ) %>%
    dplyr::select(-.row_id)

输出:

# A tibble: 2 x 13
# Groups:   time [2]
   time     w     x     y     z ode_outputs..iteration.. alpha  beta   eta  zeta lambda   phi kappa
  <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>                    <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>
1   0.2  10.0  12.2  9.59  8.22 NA                         0.1   0.1   0.1   0.1    0.5     5     1
2   0.3  10.0  13.4  9.11  7.48 NA                         0.1   0.1   0.1   0.1    0.5     5     1

数据:

# Example model-output data frame used by the pipeline above: 12 rows with
# columns time, w, x, y, z, the seven parameter columns (alpha ... kappa) and
# an all-NA ode_outputs..iteration.. column. Row names are kept as strings.
outputs =     structure(list(time = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 276.5, 276.6, 
276.7, 276.8, 276.9, 276.961144437566), w = c(10, 10.0057192322758, 
10.0198266325956, 10.040096099625, 10.0637654242843, 10.087779652849, 
-1.71585943177118, -2.04004317987084, -2.56315700921588, -3.56775247519687, 
-6.37643561014456, -13.828470036737), x = c(10, 11.0467963604334, 
12.1782709261765, 13.3728962503142, 14.6035317074526, 15.8398164069251, 
27.2774474452024, 26.3099862348669, 24.8705756934881, 22.3379071188018, 
15.8960461541267, 3.62452931346518e-144), y = c(10, 9.89605687874935, 
9.59253574727296, 9.11222320249057, 8.48917353431654, 7.76447036695841, 
-0.604572230605542, -0.878231815857628, -1.46586965791714, -3.20623046085508, 
-14.9365932475767, -3.30552834129368e+146), z = c(10, 9.05439359565339, 
8.21533762023494, 7.48379901688836, 6.85562632179817, 6.3231517466183, 
42.3149654949179, 43.8836626616462, 46.4372543252026, 51.7183454733949, 
72.7027555440752, 3.30552834129368e+146), alpha = c(0.1, 0.1, 
0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), beta = c(0.1, 
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), eta = c(0.1, 
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), zeta = c(0.1, 
0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9), lambda = c(0.9, 
0.9, 0.5, 0.5, 0.9, 0.9, 0.5, 0.9, 0.5, 0.9, 0.5, 0.5
), phi = c(5, 5, 5, 5, 5, 5, 20, 20, 20, 20, 20, 20), kappa = c(1, 
1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 10), ode_outputs..iteration.. = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1", 
"1.1", "1.2", "1.3", "1.4", "1.5", "2916.2765", "2916.2766", 
"2916.2767", "2916.2768", "2916.2769", "2916.2770"), class = "data.frame")

# Filter criteria: one row per parameter with the value it must equal.
# The pasted text still carries the printed row numbers, so the data rows have
# one more field than the header; the extra leading column is dropped as V1.
# fread is namespace-qualified like the other calls in this answer, so the
# snippet does not depend on data.table being attached.
filter_df <- data.table::fread('      parameter value
1     alpha   0.1
2      beta   0.1
3       eta   0.1
4      zeta   0.1
5    lambda   0.5
6       phi   5.0
7     kappa   1.0') %>% dplyr::select(-V1)

【讨论】:

    【解决方案2】:

    所以您想选择outputs 数据帧中与values 数据帧中的值匹配的所有行?

    这是使用 `sweep` 和 `rowSums` 的基本 R 方法。

    # Columns to test, in the order the criteria list them. as.character()
    # makes a factor `parameter` column index by label rather than by its
    # integer code.
    wanted_cols <- as.character(values$parameter)
    # TRUE wherever a cell differs from the target value for its column.
    has_mismatch <- sweep(outputs[wanted_cols], 2, values$value, `!=`)
    # Keep the rows with zero mismatching cells.
    result <- outputs[rowSums(has_mismatch) == 0, ]
    
    result
    
    #    time        w        x        y        z alpha beta eta zeta lambda phi kappa
    #1.2  0.2 10.01983 12.17827 9.592536 8.215338   0.1  0.1 0.1  0.1    0.5   5     1
    #1.3  0.3 10.04010 13.37290 9.112223 7.483799   0.1  0.1 0.1  0.1    0.5   5     1
    
    #    ode_outputs..iteration..
    #1.2                       NA
    #1.3                       NA
    

    【讨论】:

    • 它可以工作并且比 dplyr 语法更紧凑
    【解决方案3】:

    可能的 `dplyr` 和 `tidyr` 解决方案:

    通过将值数据框转换为宽格式来创建辅助数据框,并应用半连接以按所需条件进行过滤。

    您可以轻松地将其包含在一个连续的工作流程中,但我认为在单独的步骤中更容易理解。

    library(dplyr)
    library(tidyr)
    
    # Reshape the long criteria table into wide form: one column per
    # parameter, holding the value that column must equal.
    conditions <- pivot_wider(
      values,
      names_from = parameter,
      values_from = value
    )

    # Keep the rows of `outputs` that have a match in `conditions` on the
    # columns the two data frames share; no columns are added.
    semi_join(outputs, conditions)
    
    #> Joining, by = c("alpha", "beta", "eta", "zeta", "lambda", "phi", "kappa")
    #>     time        w        x        y        z alpha beta eta zeta lambda phi
    #> 1.2  0.2 10.01983 12.17827 9.592536 8.215338   0.1  0.1 0.1  0.1    0.5   5
    #> 1.3  0.3 10.04010 13.37290 9.112223 7.483799   0.1  0.1 0.1  0.1    0.5   5
    #>     kappa ode_outputs..iteration..
    #> 1.2     1                       NA
    #> 1.3     1                       NA
    

    reprex package (v2.0.0) 于 2021-07-08 创建

    【讨论】:

      猜你喜欢
      • 2021-07-10
      • 1970-01-01
      • 2020-08-17
      • 2018-09-15
      • 2020-10-23
      • 2021-12-13
      • 2023-01-04
      • 1970-01-01
      • 2015-02-17
      相关资源
      最近更新 更多