在 R 中使用 purr::map 启动功能答案

【问题标题】：boot function with purr::map in R在 R 中使用 purr::map 启动功能
【发布时间】：2021-06-29 21:05:47
【问题描述】：

我正在研究使用引导包引导两个样本 t 测试。在基因表达矩阵中，我想比较条件之间的基因，我的目标是找到表达的基因。我有一个 5*12 的矩阵（5 个控制，7 个处理和 5 个基因），首先我将此数据矩阵转换为 tibble 格式作为两个长向量，以便理解 tibble 结构并使我更容易。：

library(tidyverse)
#> Warning: package 'tidyr' was built under R version 4.0.4
#> Warning: package 'dplyr' was built under R version 4.0.4
library(magrittr)
library(boot)

##Gene Expression matrix
Exp.mat<- read.table( header = TRUE, text =  " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )

##Vectorized format from matrix
Vec_Ex.Mat <- as_tibble(t(Exp.mat))
Vec_Ex.Mat$Cond <- as.factor(c(rep("1", 5), rep("2", 7)))
Vec_Ex.Mat <- Vec_Ex.Mat %>% gather(Var, Val, -Cond)
Vec_Ex.Mat<- Vec_Ex.Mat[, c(2, 3, 1)]
colnames(Vec_Ex.Mat) <- c("Gene", "Exp", "Cond")
head(Vec_Ex.Mat)
#> # A tibble: 6 x 3
#>   Gene    Exp Cond 
#>   <chr> <dbl> <fct>
#> 1 Gene1 1.68  1    
#> 2 Gene1 0.322 1    
#> 3 Gene1 1.72  1    
#> 4 Gene1 1.91  1    
#> 5 Gene1 1.27  1    
#> 6 Gene1 1.68  2

##Created nested tibble
Nested_Ex.Mat <- Vec_Ex.Mat %>%
  dplyr::group_by(Gene) %>%
  tidyr::nest()

head(Nested_Ex.Mat)
#> # A tibble: 5 x 2
#> # Groups:   Gene [5]
#>   Gene  data                 
#>   <chr> <list>               
#> 1 Gene1 <tibble[,2] [12 x 2]>
#> 2 Gene2 <tibble[,2] [12 x 2]>
#> 3 Gene3 <tibble[,2] [12 x 2]>
#> 4 Gene4 <tibble[,2] [12 x 2]>
#> 5 Gene5 <tibble[,2] [12 x 2]>

## Function for bootstrap
bootFun <- function(df, f) {
  n <- nrow(df)
  idx <- which(df[, 2] == 2)
  idy <- which(df[, 2] == 1)
  nx <- length(idx)
  ny <- length(idy)
  new.df <- df
  new.df[idx, 1] <- df[idx, 1] - mean(df[idx, 1]) + mean(df[, 1])
  new.df[idy, 1] <- df[idy, 1] - mean(df[idy, 1]) + mean(df[, 1])
  df <- new.df
  MX <- sum(df[idx, 1] * f[idx])/sum(f[idx])
  SX <- sum(df[idx, 1]^2 * f[idx])/sum(f[idx]) - MX^2
  SX <- nx * SX/(nx - 1)
  MY <- sum(df[idy, 1] * f[idy])/sum(f[idy])
  SY <- sum(df[idy, 1]^2 * f[idy])/sum(f[idy]) - MY^2
  SY <- ny * SY/(ny - 1)
  SXY <- sqrt((SX/nx) + (SY/ny))
  (MX -MY)/SXY
}

##Bootstrap analysis with boot package using purrr::map
Nested_Ex.Mat %<>%
  dplyr::mutate(booted = purrr::map(.x=data, ~ boot::boot(data= .x, sim = "ordinary", statistic = bootFun,R = 5,stype = "f", strata=.x[, 2])))
#> Error: Problem with `mutate()` input `booted`.
#> x 'list' object cannot be coerced to type 'double'
#> i Input `booted` is `purrr::map(...)`.
#> i The error occurred in group 1: Gene = "Gene1".

^{由reprex package (v1.0.0) 于 2021-04-06 创建}

我不明白的是我是否正确使用了索引，我不知道如何将这些数据与引导包引入 tibble 矢量化格式。在示例here 中，分析了单个列，它正在工作。但是我想通过引导包对每个基因使用strata选项，在tibble数据中有两列。是否有可能摆脱这种代码负载，或者我们可以通过更短的代码和正确的索引使函数更高效？你能分享你的知识和建议吗？谢谢。

【问题讨论】：

请提供一个可重现的最小示例 (stackoverflow.com/help/minimal-reproducible-example)

标签： r nested purrr tibble

【解决方案1】：

首先我遇到了上述错误。之后，我在引导功能中更改了地层。（“strata = .x$cond”而不是strata=.x[, 2]）。然后它在第一个 boot.fun 函数中返回带有 mean 函数的 NA 。我将 boot.fun 中的均值更改为 colMeans。它现在正在工作。我也会用 matrxix 运行它以测试准确性。

library(tidyverse)
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)
library(boot)

Exp.mat<- read.table(header = TRUE, text =  " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )

Exp.mat.new <- Exp.mat %>% 
  # Convert row names to a column
  rownames_to_column('gene') %>% 
  # "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
  pivot_longer(cols = -gene,
               names_to = 'cond',
               values_to = 'exp') %>% 
  # Fix condition
  mutate(cond = case_when(
    str_detect(cond, pattern = 'C') ~ '1',
    TRUE ~ '2')) %>% 
  # Nest 
  group_by(gene) %>% 
  nest()

bootFun.tbl <- function(df,f) {
  n <- nrow(df)
  idx <- which(df[, 1] == 2) #select cond = 2 (treatment)
  idy <- which(df[, 1] == 1) #select cond = 1 (control)
  nx <- length(idx)
  ny <- length(idy)
  new.df <- df
  new.df[idx, 2] <- df[idx, 2] - colMeans(df[idx, 2]) + colMeans(df[, 2]) 
  new.df[idy, 2] <- df[idy, 2] - colMeans(df[idy, 2]) + colMeans(df[, 2])
  df <- new.df
  MX <- sum(df[idx, 2] * f[idx])/sum(f[idx])
  SX <- sum(df[idx, 2]^2 * f[idx])/sum(f[idx]) - MX^2
  SX <- nx * SX/(nx - 1)
  MY <- sum(df[idy, 2] * f[idy])/sum(f[idy])
  SY <- sum(df[idy, 2]^2 * f[idy])/sum(f[idy]) - MY^2
  SY <- ny * SY/(ny - 1)
  SXY <- sqrt((SX/nx) + (SY/ny))
  (MX -MY)/SXY
}

Exp.mat.new <- Exp.mat.new %>% 
  # Perform  bootstrap test
  mutate(boot.fun = map(.x = data,
                        ~boot::boot(data=.x, sim = "ordinary", 
                                    statistic = bootFun.tbl, R = 5,
                                    stype = "f", strata = .x$cond)))
head(Exp.mat.new)
#> # A tibble: 5 x 3
#> # Groups:   gene [5]
#>   gene  data                  boot.fun
#>   <chr> <list>                <list>  
#> 1 Gene1 <tibble[,2] [12 × 2]> <boot>  
#> 2 Gene2 <tibble[,2] [12 × 2]> <boot>  
#> 3 Gene3 <tibble[,2] [12 × 2]> <boot>  
#> 4 Gene4 <tibble[,2] [12 × 2]> <boot>  
#> 5 Gene5 <tibble[,2] [12 × 2]> <boot>
head(Exp.mat.new$boot.fun)
#> [[1]]
#> 
#> STRATIFIED BOOTSTRAP
#> 
#> 
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary", 
#>     stype = "f", strata = .x$cond)
#> 
#> 
#> Bootstrap Statistics :
#>          original    bias    std. error
#> t1* -6.729777e-16 -1.237088    2.189771
#> 
#> [[2]]
#> 
#> STRATIFIED BOOTSTRAP
#> 
#> 
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary", 
#>     stype = "f", strata = .x$cond)
#> 
#> 
#> Bootstrap Statistics :
#>         original    bias    std. error
#> t1* 5.264653e-16 0.4360011   0.4969442
#>

^{由reprex package (v2.0.0) 于 2021-04-08 创建}

【讨论】：

【解决方案2】：

我不确定您为什么要引导 t 检验。运行t.test 函数似乎更容易。这是我的代码：

加载包

library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)

获取数据

Exp.mat<- read.table(header = TRUE, text =  " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )

检查数据

head(Exp.mat)
#>             C1        C2         C3        C4        C5       T1        T2
#> Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389
#> Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023
#> Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405
#> Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747
#> Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895
#>              T3        T4        T5       T6        T7
#> Gene1 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
#> Gene2 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
#> Gene3 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
#> Gene4 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
#> Gene5 0.9986083 0.0000000 0.7091586 1.288114 2.0487426

格式化数据

Exp.mat.new <- Exp.mat %>% 
        # Convert row names to a column
        rownames_to_column('gene') %>% 
        # "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
        pivot_longer(cols = -gene,
                     names_to = 'cond',
                     values_to = 'exp') %>% 
        # Fix condition
        mutate(cond = case_when(
                str_detect(cond, pattern = 'C') ~ 'C',
                TRUE ~ 'T')) %>% 
        # Nest 
        group_by(gene) %>% 
        nest()

进行分析

Exp.mat.new <- Exp.mat.new %>% 
        # Perform t-test
        mutate(t_test = map(.x = data,
                            ~ t.test(.x$exp ~ .x$cond))) %>% 
        # Extract vital statistics
        mutate(C_minus_T_95CI = map(.x = t_test, # 95% CI of the mean difference between groups C and T
                                    ~ paste0(round(.x$conf.int[[1]], 3), ' to ', 
                                             round(.x$conf.int[[2]], 3))),
               p_value = map(.x = t_test, # p-value
                             ~ round(.x$p.value, 5))) %>% 
        # Unnest the data
        unnest(cols = c(C_minus_T_95CI, p_value)) %>% 
        # Arrange by p-value
        arrange(p_value)

打印数据帧

Exp.mat.new
#> # A tibble: 5 x 5
#> # Groups:   gene [5]
#>   gene  data              t_test  C_minus_T_95CI   p_value
#>   <chr> <list>            <list>  <chr>              <dbl>
#> 1 Gene2 <tibble [12 × 2]> <htest> -1.028 to -0.071  0.0288
#> 2 Gene3 <tibble [12 × 2]> <htest> -1.237 to 0.475   0.323 
#> 3 Gene4 <tibble [12 × 2]> <htest> -0.482 to 1.251   0.345 
#> 4 Gene5 <tibble [12 × 2]> <htest> -0.95 to 1.958    0.423 
#> 5 Gene1 <tibble [12 × 2]> <htest> -0.914 to 0.66    0.712

如果需要，打印每个单独 t 检验的完整结果

walk(.x = Exp.mat.new$t_test, 
     ~ print(.x))
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = -2.6068, df = 8.8453, p-value = 0.02881
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -1.02805952 -0.07141179
#> sample estimates:
#> mean in group C mean in group T 
#>       0.3678434       0.9175791 
#> 
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = -1.0691, df = 6.4703, p-value = 0.3233
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -1.2367921  0.4754686
#> sample estimates:
#> mean in group C mean in group T 
#>        2.195439        2.576101 
#> 
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = 0.99278, df = 9.7754, p-value = 0.3448
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -0.4816528  1.2514667
#> sample estimates:
#> mean in group C mean in group T 
#>        1.838916        1.454009 
#> 
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = 0.86423, df = 5.5685, p-value = 0.4231
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -0.9503223  1.9584180
#> sample estimates:
#> mean in group C mean in group T 
#>        1.506200        1.002152 
#> 
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = -0.38483, df = 6.6963, p-value = 0.7123
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -0.9143974  0.6604509
#> sample estimates:
#> mean in group C mean in group T 
#>        1.381890        1.508863

^{由reprex package (v1.0.0) 于 2021-04-06 创建}

【讨论】：

努力。虽然我的问题还没有解决，但这是我收到的最好的帮助。感谢您的帮助和努力。我的目标是以 999 次重复执行引导 t 检验。如 Efron 和 Tibshirani (1993) 中的算法 16.2 中所述，并且可以使用引导功能。