【问题标题】:R dplyr across: Dynamically specifying arguments to functions t.test and varTest(R dplyr across:动态指定函数 t.test 和 varTest 的参数)
【发布时间】:2021-07-28 23:52:33
【问题描述】:

我正在编写一些 dplyr 的 across 语句,想要使用函数 t.test 和 varTest 计算一些 p 值。用于计算的 x= 列在 df_vars 中,mu= 和 sigma.squared= 参数值在 df_mu_sigma 中。

我需要的数据的硬编码版本在 df_sumry 中。如果在运行代码时变量名总是相同的,那么这样的就足够了。然而,事实并非如此。

我需要的非硬编码版本的开头是在 df_sumry2 中。但这并不能产生正确的结果,因为 mu= 和 sigma.squared= 的值不是动态指定的。 df_sumry2 中只有前两个 p 值是正确的。之后它们总是错误的,因为代码总是使用 mpg 变量的值。

我怎样才能始终如一地为 mu 和 sigma.squared 插入正确的值?

library(dplyr)
library(magrittr)
library(EnvStats)

# Columns under test: four variables taken from mtcars.
df_vars <- select(mtcars, mpg, cyl, disp, hp)

# Fix the RNG so the 12-row sample below is reproducible.
set.seed(9302)

# Reference values: mean and standard deviation of every column, computed on
# a random 12-row sample. Result columns are named <column>_mean and
# <column>_std (e.g. mpg_mean, mpg_std).
df_mu_sigma <- df_vars %>%
  slice_sample(n = 12) %>%
  summarise(across(.cols = everything(), .fns = list(mean = mean, std = sd)))

# Hard-coded reference summary. For each variable: its mean, the t.test()
# p-value against the reference mean, its standard deviation, and the
# varTest() p-value against the squared reference standard deviation.
df_sumry <- summarise(
  df_vars,
  # mpg
  mpg_mean = mean(mpg),
  mpg_mean_prob = t.test(mpg, mu = df_mu_sigma[["mpg_mean"]])$p.value,
  mpg_std = sd(mpg),
  mpg_std_prob = varTest(
    mpg,
    sigma.squared = df_mu_sigma[["mpg_std"]]^2
  )$p.value,
  # cyl
  cyl_mean = mean(cyl),
  cyl_mean_prob = t.test(cyl, mu = df_mu_sigma[["cyl_mean"]])$p.value,
  cyl_std = sd(cyl),
  cyl_std_prob = varTest(
    cyl,
    sigma.squared = df_mu_sigma[["cyl_std"]]^2
  )$p.value,
  # disp
  disp_mean = mean(disp),
  disp_mean_prob = t.test(disp, mu = df_mu_sigma[["disp_mean"]])$p.value,
  disp_std = sd(disp),
  disp_std_prob = varTest(
    disp,
    sigma.squared = df_mu_sigma[["disp_std"]]^2
  )$p.value,
  # hp
  hp_mean = mean(hp),
  hp_mean_prob = t.test(hp, mu = df_mu_sigma[["hp_mean"]])$p.value,
  hp_std = sd(hp),
  hp_std_prob = varTest(
    hp,
    sigma.squared = df_mu_sigma[["hp_std"]]^2
  )$p.value
)

# Names of the columns to summarise, taken from df_vars.
vars_num <- names(df_vars)

# First attempt at a non-hard-coded version of df_sumry using across().
# NOTE: both test functions read df_mu_sigma$mpg_mean / df_mu_sigma$mpg_std
# for every column, so only the mpg p-values agree with df_sumry. This is the
# problem the question asks about; the code is kept as-is deliberately.
df_sumry2 <- df_vars %>%
  summarize(
    across(
      all_of(vars_num),
      list(mean = mean,
           mean_prob = function(x) t.test(x, mu = df_mu_sigma$mpg_mean)$p.value,
           std = sd,
           std_prob = function(x) varTest(x, sigma.squared = df_mu_sigma$mpg_std^2)$p.value)
    )
  )

【问题讨论】:

    标签: r dynamic dplyr across


    【解决方案1】:

    我似乎想出了解决我自己问题的方法。我很乐意看到替代解决方案,因为它们可能比我的更好。

    library(dplyr)
    library(magrittr)
    library(EnvStats)
    
    # Columns under test: four variables taken from mtcars.
    df_vars <- mtcars %>%
      select(mpg, cyl, disp, hp)

    # Same seed as the question's script. Without it slice_sample() draws a
    # different 12-row sample on every run, so the reference values (and every
    # p-value derived from them) would not be reproducible.
    set.seed(9302)

    # Reference values: mean and standard deviation of every column on a
    # random 12-row sample. Result columns are named <column>_mean and
    # <column>_std (e.g. mpg_mean, mpg_std).
    df_mu_sigma <- mtcars %>%
      select(mpg, cyl, disp, hp) %>%
      slice_sample(n = 12) %>%
      summarize(
        across(
          everything(),
          list(mean = mean, std = sd)
        )
      )
    
    # Hard-coded reference summary. For each variable: its mean, the t.test()
    # p-value against the reference mean, its standard deviation, and the
    # varTest() p-value against the squared reference standard deviation.
    df_sumry <- summarise(
      df_vars,
      # mpg
      mpg_mean = mean(mpg),
      mpg_mean_prob = t.test(mpg, mu = df_mu_sigma[["mpg_mean"]])$p.value,
      mpg_std = sd(mpg),
      mpg_std_prob = varTest(
        mpg,
        sigma.squared = df_mu_sigma[["mpg_std"]]^2
      )$p.value,
      # cyl
      cyl_mean = mean(cyl),
      cyl_mean_prob = t.test(cyl, mu = df_mu_sigma[["cyl_mean"]])$p.value,
      cyl_std = sd(cyl),
      cyl_std_prob = varTest(
        cyl,
        sigma.squared = df_mu_sigma[["cyl_std"]]^2
      )$p.value,
      # disp
      disp_mean = mean(disp),
      disp_mean_prob = t.test(disp, mu = df_mu_sigma[["disp_mean"]])$p.value,
      disp_std = sd(disp),
      disp_std_prob = varTest(
        disp,
        sigma.squared = df_mu_sigma[["disp_std"]]^2
      )$p.value,
      # hp
      hp_mean = mean(hp),
      hp_mean_prob = t.test(hp, mu = df_mu_sigma[["hp_mean"]])$p.value,
      hp_std = sd(hp),
      hp_std_prob = varTest(
        hp,
        sigma.squared = df_mu_sigma[["hp_std"]]^2
      )$p.value
    )
    
    # Names of the columns to summarise, taken from df_vars.
    vars_num <- names(df_vars)
    
    # glue() is used below to build the reference column names.
    library(glue)
    
    # Dynamic version of df_sumry: each test function derives the name of its
    # reference column in df_mu_sigma from the argument it was called with,
    # instead of hard-coding mpg.
    df_sumry2 <- df_vars %>%
        summarize(
            across(
                all_of(vars_num),
                list(mean = mean,
                        mean_prob = function(x) {
                            # Builds "<symbol>_mean" from the symbol supplied as `x`.
                            # NOTE(review): this presumably relies on across() calling
                            # the function with the bare column name as the argument
                            # expression -- confirm for the dplyr version in use.
                            mu_name <- glue("{ensym(x)}_mean")
                            t.test(x, mu = df_mu_sigma[[mu_name]])$p.value
                        },
                        std = sd,
                        std_prob = function(x) {
                            # Same lookup for "<symbol>_std"; the value is squared
                            # because varTest() takes sigma.squared.
                            sigma_name <- glue("{ensym(x)}_std")
                            varTest(x, sigma.squared = df_mu_sigma[[sigma_name]]^2)$p.value
                        }
                )
            )
        )
    
    # Compare the dynamic result with the hard-coded summary.
    all.equal(df_sumry, df_sumry2)
    

    【讨论】:

      【解决方案2】:

      这并不比您的解决方案好多少,但我会使用 cur_column() 而不是 ensym() 来避免 quosures 处理。

      此外,将查询放在单独的函数中会使事情变得更整洁。

      最后,为了清楚起见,我将使用 purrr 风格的 lambda 写法(`~ ...`)而不是 `function(x)` 形式的匿名函数。

      # Look up a reference value for the column across() is currently
      # processing.
      #
      # suffix: string appended to the current column name (as returned by
      #   cur_column()) to form the name of the column to read from
      #   df_mu_sigma, e.g. "_mean" while processing mpg reads
      #   df_mu_sigma[["mpg_mean"]]. The same helper serves the "_std" lookups.
      #
      # Returns the contents of that df_mu_sigma column. Stops with an explicit
      # message when the column does not exist, rather than returning NULL and
      # letting the caller fail on it.
      #
      # Reads df_mu_sigma from the enclosing environment.
      get_mu <- function(suffix) {
        # paste0() builds the name here; glue() would work as well.
        ref_name <- paste0(cur_column(), suffix)
        if (!ref_name %in% names(df_mu_sigma)) {
          stop(
            "No column '", ref_name, "' in df_mu_sigma.",
            call. = FALSE
          )
        }
        df_mu_sigma[[ref_name]]
      }
      
      # For every column named in vars_num, compute the mean, the t.test()
      # p-value, the standard deviation and the varTest() p-value; get_mu()
      # supplies the matching reference value for the current column.
      df_vars %>%
        summarise(across(
          .cols = all_of(vars_num),
          .fns = list(
            mean = mean,
            mean_prob = ~ t.test(.x, mu = get_mu("_mean"))$p.value,
            std = sd,
            std_prob = ~ varTest(.x, sigma.squared = get_mu("_std")^2)$p.value
          )
        )) %>%
        t() # transpose, just to format the output
      
      
      #                        [,1]
      # mpg_mean        20.09062500
      # mpg_mean_prob    0.01808550
      # mpg_std          6.02694805
      # mpg_std_prob     0.96094601
      # cyl_mean         6.18750000
      # cyl_mean_prob    0.10909740
      # cyl_std          1.78592165
      # cyl_std_prob     0.77092484
      # disp_mean      230.72187500
      # disp_mean_prob   0.17613878
      # disp_std       123.93869383
      # disp_std_prob    0.96381507
      # hp_mean        146.68750000
      # hp_mean_prob     0.03914858
      # hp_std          68.56286849
      # hp_std_prob      0.03459963
      
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 2013-10-04
        • 2013-12-23
        • 1970-01-01
        • 2017-07-02
        • 2020-11-04
        • 2017-06-15
        • 1970-01-01
        • 1970-01-01
        相关资源
        最近更新 更多