通过排列进行多组测试答案

【问题标题】：Multiple groups tests via permutation通过排列进行多组测试
【发布时间】：2018-01-04 21:26:48
【问题描述】：

我有一个 df，其中包含与两个实验相关的两组值（value_1 和 value_2）。

一个实验包含两组（0 和 1），另一个实验包含三个组（0,1,2）。

test    group   Value_1    Value_2
AA      0           15.1    11.2
AA      0           12.4    8.6
AA      1           9.6     22.5
AA      1           10.2    22
BB      0           12.11   11
BB      0           14      1.2
BB      1           11      13.2
BB      1           12.3    9
BB      2           14.2    12
BB      2           15      13

df <- structure(list(test = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("AA", "BB"), class = "factor"), group = c(0L, 
0L, 1L, 1L, 0L, 0L, 1L, 1L, 2L, 2L), Value_1 = c(15.1, 12.4, 
9.6, 10.2, 12.11, 14, 11, 12.3, 14.2, 15), Value_2 = c(11.2, 
8.6, 22.5, 22, 11, 1.2, 13.2, 9, 12, 13)), .Names = c("test", 
"group", "Value_1", "Value_2"), class = "data.frame", row.names = c(NA, 
-10L))

我想对 value_1 和 value_2 应用置换测试 - 按测试、按组 - 覆盖：

两组案例（0 vs 1）和
超过 2 个案例（0 对 1、0 对 2、1 对 2）。

到目前为止我所做的 - 分解成碎片是：

当组只有两个时，我只需应用 oneway.test()：

df %>%
  filter(test %in% 'AA') -> df_test_aa

df_test_aa_value_1 <- oneway.test(df_test_aa$Value_1~df_test_aa$group)
df_test_aa_value_1$p.value
[1] 0.2011234


df_test_aa_value_2 <- oneway.test(df_test_aa$Value_2~df_test_aa$group)
df_test_aa_value_2$p.value
[1] 0.05854026

当组数超过 2 时，我会测试所有可能的排列：

前 0 对 1：

df %>% filter(test %in% 'BB' & group %in% c(0,1)) -> df_test_bb_01

df_test_bb_01_value_1 <- oneway.test(df_test_bb_01$Value_1~df_test_bb_01$group)
df_test_bb_01_value_1$p.value
[1] 0.3585415

df_test_bb_01_value_2 <-    oneway.test(df_test_bb_01$Value_2~df_test_bb_01$group)
df_test_bb_01_value_2$p.value
[1] 0.4848446

然后 0 对 2：

 df %>%
 filter(test %in% 'BB' & group %in% c(0,2)) -> df_test_bb_02

 df_test_bb_02_value_1 <-       oneway.test(df_test_bb_02$Value_1~df_test_bb_02$group)
 df_test_bb_02_value_1$p.value
 [1] 0.3246012

 df_test_bb_02_value_2 <- oneway.test(df_test_bb_02$Value_2~df_test_bb_02$group)
 df_test_bb_02_value_2$p.value
 [1] 0.4142838

然后 1 对 2：

 df %>%
  filter(test %in% 'BB' & group %in% c(1,2)) -> df_test_bb_12

  df_test_bb_12_value_1 <- oneway.test(df_test_bb_12$Value_1~df_test_bb_12$group)
  df_test_bb_12_value_1$p.value
  [1] 0.08105404


  df_test_bb_12_value_2 <- oneway.test(df_test_bb_12$Value_2~df_test_bb_12$group)
  df_test_bb_12_value_2$p.value
  [1] 0.6245713

因此，我希望获得一个看起来像这样的 df：

test value  p_value_2sided  hypothesis
AA  Value_1   0.201         0,1
AA  Value_2   0.059         0,1
BB  Value_1   0.359         0,1
BB  Value_1   0.325         0,2
BB  Value_1   0.081         1,2
BB  Value_2   0.485         0,1
BB  Value_2   0.414         0,2
BB  Value_2   0.625         1,2

感谢您的提示！

【问题讨论】：

标签： r dplyr ab-testing

【解决方案1】：

df <- structure(list(test = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("AA", "BB"), class = "factor"), group = c(0L, 
0L, 1L, 1L, 0L, 0L, 1L, 1L, 2L, 2L), Value_1 = c(15.1, 12.4, 
9.6, 10.2, 12.11, 14, 11, 12.3, 14.2, 15), Value_2 = c(11.2, 
8.6, 22.5, 22, 11, 1.2, 13.2, 9, 12, 13)), .Names = c("test", 
"group", "Value_1", "Value_2"), class = "data.frame", row.names = c(NA, -10L))

library(tidyverse)

# reshape dataset
df2 = df %>% gather(value, v, -test, -group)

# function to compute p value
# vectorized version
f = function(t,val,x1,x2) {
  (df2 %>% 
     filter(test == t & value == val & group %in% c(x1,x2)) %>% 
     oneway.test(v~group, data = .))$p.value }
f = Vectorize(f)

df2 %>% 
  distinct(test, group, value) %>%       # get unique combinations
  group_by(test, value) %>%              # for each test and value
  nest() %>%                             # nest rest of columns
  mutate(d = map(data, ~data.frame(t(combn(.$group, 2)))),
         hypothesis = map(d, ~paste0(.$X1, ",", .$X2))) %>%  # get pairs/combinations of values
  unnest(d, hypothesis) %>%              # unnest data
  mutate(pval = f(test, value, X1, X2))  # apply vectorised function to get p value

# # A tibble: 8 x 6
#   test   value   hypothesis    X1    X2   pval
#   <fctr> <chr>   <chr>      <int> <int>  <dbl>
# 1 AA     Value_1 0,1            0     1 0.201 
# 2 BB     Value_1 0,1            0     1 0.359 
# 3 BB     Value_1 0,2            0     2 0.325 
# 4 BB     Value_1 1,2            1     2 0.0811
# 5 AA     Value_2 0,1            0     1 0.0585
# 6 BB     Value_2 0,1            0     1 0.485 
# 7 BB     Value_2 0,2            0     2 0.414 
# 8 BB     Value_2 1,2            1     2 0.625

如果你真的不需要X1 和X2，你可以删除它们。但是，通过这种方式，您（也）可以将它们作为单独的数字变量，以防您希望在分析的稍后阶段在另一个过程中使用它们（例如，过滤特定组）。

【讨论】：

【解决方案2】：

嗯，这不漂亮，但是……

df <- structure(list(test = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("AA", "BB"), class = "factor"), group = c(0L, 
0L, 1L, 1L, 0L, 0L, 1L, 1L, 2L, 2L), Value_1 = c(15.1, 12.4, 
9.6, 10.2, 12.11, 14, 11, 12.3, 14.2, 15), Value_2 = c(11.2, 
8.6, 22.5, 22, 11, 1.2, 13.2, 9, 12, 13)), .Names = c("test", 
"group", "Value_1", "Value_2"), class = "data.frame", row.names = c(NA, 
-10L))

as.data.frame(
  do.call(rbind, by(df, factor(df$test), function(x) {
  h <- t(combn(unique(x$group), 2))
  p <- apply(h, 1, function(y) {
    with(x[x$group %in% y, ], {
      c(oneway.test(Value_1 ~ group)$p.value,
        oneway.test(Value_2 ~ group)$p.value)
      })
    })

  h <- rep(paste(h[, 1], h[, 2], sep = ","), each = 2)

  cbind(test = as.character(x$test[1]), value = c("Value_1", "Value_2"), p_value = as.vector(p), hypothesis = h)
  }))
)


  test   value            p_value hypothesis
1   AA Value_1  0.201123366107666        0,1
2   AA Value_2 0.0585402590546805        0,1
3   BB Value_1  0.358541470571387        0,1
4   BB Value_2  0.484844587956832        0,1
5   BB Value_1  0.324601180998953        0,2
6   BB Value_2  0.414283756097153        0,2
7   BB Value_1 0.0810540380817137        1,2
8   BB Value_2  0.624571310834221        1,2

【讨论】：

它有效，我理解代码。我的问题是：如果我添加一个额外的变量（比如说 Value_3），代码就不再起作用了，因为它没有添加正确的组合......让我编辑我的问题
你确定吗？它对我来说很好用，只要确保也将 h <- rep(paste(h[, 1], h[, 2], sep = ","), each = 2) 设置为 each = 3 ！