在R中为多个变量创建聚合数据透视表[重复]答案

【问题标题】：creating aggredated pivot tables in R for multiple variables [duplicate]在R中为多个变量创建聚合数据透视表[重复]
【发布时间】：2020-10-10 11:47:36
【问题描述】：

我有 2 个数据集

# Sample data set 1 (dput output): 9 rows.
# Columns x1..x13 (plus x5.1) are the grouping variables; x6 and x7 are factors.
# `prediction` is a factor whose labels are numeric strings, so it has to be
# converted via character to numeric before a mean can be taken.
data1=structure(list(x1 = c(7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L), x2 = c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), x3 = c(1L, 1L, 1L, 2L, 1L, 2L, 
1L, 1L, 2L), x4 = c(156L, 156L, 238L, 156L, 238L, 238L, 156L, 
156L, 156L), x5 = c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), x5.1 = c(31L, 
1L, 9L, 8L, 6L, 11L, 3L, 3L, 2L), x6 = structure(c(1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("En", "RU"), class = "factor"), 
    x7 = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L), .Label = c("13", 
    "other"), class = "factor"), x8 = c(1L, 1L, 2L, 2L, 1L, 1L, 
    1L, 1L, 1L), x9 = c(0L, 1L, 0L, 0L, 2L, 3L, 2L, 0L, 0L), 
    x10 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L), x11 = c(745L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 675L), x12 = c(440L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 445L), x13 = c(2L, 
    0L, 0L, 0L, 0L, 0L, 2L, 1L, 1L), prediction = structure(c(1L, 
    5L, 6L, 8L, 7L, 2L, 3L, 3L, 4L), .Label = c("0.0646", "0.0713", 
    "0.1319", "0.2629", "0.3479", "0.3693", "0.4037", "0.4123"
    ), class = "factor")), class = "data.frame", row.names = c(NA, 
-9L))

第二个数据集

# Sample data set 2 (dput output): a single factor column `pred` with 10 rows.
# The factor labels are numeric strings, so `pred` must be converted via
# character to numeric before it can be averaged.
data2=structure(list(pred = structure(c(4L, 2L, 1L, 3L, 1L, 5L, 7L, 
6L, 6L, 1L), .Label = c("0.0226902365684509", "0.0326902365684509", 
"0.0826902365684509", "0.1211001253128052", "0.411001253128052", 
"0.611001253128052", "0.64564001253128052"), class = "factor")), class = "data.frame", row.names = c(NA, 
-10L))

如何为每个变量 x1-x13 创建数据透视表，并对 data1$prediction 和 data2$pred 按均值（mean）进行聚合？

例如变量 x1 的期望输出

x1  mean_prediction mean_pred
7   0.31            0.056
6   0.14            0.569

也就是说，输出必须把这 12 个表放在同一个数据框中，而不是各自独立的数据框。该怎么做？

#更新

# Updated data (dput output, not assigned to any name here): 9 rows with the
# same x1..x13 / x5.1 / prediction columns plus a numeric `pred` column.
# NOTE(review): presumably this is the object referred to as `df` in
# solution 1 - confirm and assign it (e.g. `df <- structure(...)`) before use.
structure(list(x1 = c(7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L), x2 = c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), x3 = c(1L, 1L, 1L, 2L, 1L, 2L, 
1L, 1L, 2L), x4 = c(156L, 156L, 238L, 156L, 238L, 238L, 156L, 
156L, 156L), x5 = c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), x5.1 = c(31L, 
1L, 9L, 8L, 6L, 11L, 3L, 3L, 2L), x6 = structure(c(1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("En", "RU"), class = "factor"), 
    x7 = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L), .Label = c("13", 
    "other"), class = "factor"), x8 = c(1L, 1L, 2L, 2L, 1L, 1L, 
    1L, 1L, 1L), x9 = c(0L, 1L, 0L, 0L, 2L, 3L, 2L, 0L, 0L), 
    x10 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L), x11 = c(745L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 675L), x12 = c(440L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 445L), x13 = c(2L, 
    0L, 0L, 0L, 0L, 0L, 2L, 1L, 1L), prediction = structure(c(1L, 
    5L, 6L, 8L, 7L, 2L, 3L, 3L, 4L), .Label = c("0.0646", "0.0713", 
    "0.1319", "0.2629", "0.3479", "0.3693", "0.4037", "0.4123"
    ), class = "factor"), pred = c(0.121100125, 0.032690237, 
    0.022690237, 0.082690237, 0.022690237, 0.411001253, 0.645640013, 
    0.611001253, 0.611001253)), class = "data.frame", row.names = c(NA, 
-9L))

【问题讨论】：

data1$prediction 和 data2$pred 有什么关系？
@RonakShah，这些数据可以通过 cbind 绑定。没关系。我只是复制 pred 并粘贴到新数据集中
data1 有 9 行，data2 有 10 行。我认为我们不能直接cbind。
@RonakShah，这是我的错误，请检查编辑后的帖子，我做了 cbind ，这里一定是正确的

标签： r dplyr data.table

【解决方案1】：

您可以获取长格式数据，为每个列名和值分组，并为prediction 和pred 列取mean。

library(dplyr)

# Reshape to long format, then average `prediction` and `pred` for every
# (variable name, value) pair. The result has one row per pair.
df %>%
  mutate(
    # Cast every grouping column to character so pivot_longer() can stack
    # integer and factor columns into one `value` column.
    across(x1:x13, as.character),
    # `prediction` is a factor of numeric strings: convert through character
    # so we get the label values rather than the internal level codes.
    prediction = as.numeric(as.character(prediction))
  ) %>%
  tidyr::pivot_longer(cols = x1:x13) %>%
  group_by(name, value) %>%
  summarise(
    # Pass na.rm inside an explicit function; forwarding it through
    # across(...) is deprecated in dplyr >= 1.1.
    across(c(prediction, pred), function(x) mean(x, na.rm = TRUE)),
    # Return an ungrouped tibble instead of one still grouped by `name`.
    .groups = "drop"
  )

#   name  value prediction   pred
#   <chr> <chr>      <dbl>  <dbl>
# 1 x1    6         0.150  0.570 
# 2 x1    7         0.320  0.0564
# 3 x10   0         0.257  0.186 
# 4 x10   1         0.197  0.628 
# 5 x11   120       0.348  0.0327
# 6 x11   130       0.404  0.0227
# 7 x11   140       0.369  0.0227
# 8 x11   200       0.412  0.0827
# 9 x11   410       0.0713 0.411 
#10 x11   460       0.132  0.628 
# … with 39 more rows

【讨论】：

【解决方案2】：

接受您的输入。为了便于处理，先把因子转换成字符。

# Turn the two factor grouping columns into plain character vectors.
factor_cols <- c("x6", "x7")
data1[factor_cols] <- lapply(data1[factor_cols], as.character)

然后，将值列转换为数值型——在上面的代码中，它们是以数字字符串为标签的因子。

# Convert the factor value columns to numeric. Go through as.character() so
# the factor *labels* are parsed; as.numeric() on a factor alone would return
# the internal level codes instead.
data1$prediction <- as.numeric(as.character(data1$prediction))
data2$pred <- as.numeric(as.character(data2$pred))

由于 data2 比 data1 多出一行，这里只取 data2$pred 的前 nrow(data1) 个值，即舍弃最后一行。

# data2 has one more row than data1, so keep only the first nrow(data1)
# values. seq_len() is used instead of 1:n so an empty data1 yields an empty
# index rather than c(1, 0).
data1$pred <- data2$pred[seq_len(nrow(data1))]

下面是一个函数：它接收列名（字符串），先创建一个只包含输出所需数据的 tibble，并把分组列的值转换为字符（因为有些列是数字、有些列是字符串，统一成字符可以避免堆叠各列结果时因类型不一致而出错），然后输出该列每个取值对应的分组均值。

# Compute the mean of `prediction` and `pred` for each distinct value of one
# grouping column.
#
# Args:
#   colname: name of a single column of `df`, given as a string.
#   df:      data frame containing `colname`, `prediction` and `pred`
#            (defaults to `data1`).
#
# Returns:
#   A tibble with columns predictor (the column name), group (the column
#   value as character), mean_prediction and mean_pred; one row per distinct
#   value of the column.
get_column_break_down <- function(colname, df = data1) {
  stopifnot(
    is.character(colname),
    length(colname) == 1L,
    colname %in% names(df)
  )

  res_df <- tibble(
    # `[[` matches the name exactly (no regex, so names like "x5.1" are safe)
    # and always yields a vector, whether `df` is a data.frame or a tibble.
    # Values are cast to character so numeric and string columns share one
    # type when the per-column results are stacked later.
    group = as.character(df[[colname]]),
    prediction = df$prediction,
    pred = df$pred
  )

  res_df %>%
    group_by(group) %>%
    summarize(
      mean_prediction = mean(prediction),
      mean_pred = mean(pred)
    ) %>%
    mutate(predictor = colname) %>%
    ungroup() %>%
    select(predictor, group, mean_prediction, mean_pred)
}

从data1获取列名的向量

# Grouping columns are every column except the two value columns. Selecting
# by name (rather than positions 1:14) keeps this correct if columns are
# added or reordered.
colname_vec <- setdiff(names(data1), c("prediction", "pred"))

使用第一列名称创建初始 data.frame 或在本例中为 tibble。

# Seed the result with the break-down of the first grouping column.
df <- get_column_break_down(colname = colname_vec[1])

循环遍历剩余的列，将行绑定到df 变量，或者将行堆叠在一起。

# Append the break-downs of all remaining columns in one bind_rows() call.
# colname_vec[-1] is empty when there is only one column (2:length(x) would
# yield c(2, 1) instead), and building the list first avoids re-copying `df`
# on every iteration.
df <- bind_rows(df, lapply(colname_vec[-1], get_column_break_down))

最后是输出。

注意：因为它是一个 tibble，打印时会对数字进行四舍五入；如果您在环境中查看 df 变量或将其写入 csv，会发现这些数字与您期望的结果非常接近。

df
# A tibble: 49 x 4
 predictor group mean_prediction mean_pred
 <chr>     <chr>           <dbl>     <dbl>
 1 x1        6               0.150    0.570 
 2 x1        7               0.320    0.0564
 3 x2        1               0.197    0.611 
 4 x2        2               0.257    0.191 
 5 x3        1               0.242    0.243 
 6 x3        2               0.249    0.368 
 7 x4        156             0.225    0.351 
 8 x4        238             0.281    0.152 
 9 x5        0               0.205    0.349 
10 x5        1               0.380    0.0577
# ... with 39 more rows

【讨论】：