【问题标题】:creating aggregated pivot tables in R for multiple variables [duplicate] 在 R 中为多个变量创建聚合数据透视表[重复]
【发布时间】:2020-10-10 11:47:36
【问题描述】:

我有 2 个数据集

# Example dataset 1 (dput output): a 9-row data.frame with the grouping
# variables x1..x13 (including x5.1). x6, x7 and `prediction` are stored as
# factors; the levels of `prediction` are numeric strings, so it has to be
# converted before any arithmetic.
data1=structure(list(x1 = c(7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L), x2 = c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), x3 = c(1L, 1L, 1L, 2L, 1L, 2L, 
1L, 1L, 2L), x4 = c(156L, 156L, 238L, 156L, 238L, 238L, 156L, 
156L, 156L), x5 = c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), x5.1 = c(31L, 
1L, 9L, 8L, 6L, 11L, 3L, 3L, 2L), x6 = structure(c(1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("En", "RU"), class = "factor"), 
    x7 = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L), .Label = c("13", 
    "other"), class = "factor"), x8 = c(1L, 1L, 2L, 2L, 1L, 1L, 
    1L, 1L, 1L), x9 = c(0L, 1L, 0L, 0L, 2L, 3L, 2L, 0L, 0L), 
    x10 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L), x11 = c(745L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 675L), x12 = c(440L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 445L), x13 = c(2L, 
    0L, 0L, 0L, 0L, 0L, 2L, 1L, 1L), prediction = structure(c(1L, 
    5L, 6L, 8L, 7L, 2L, 3L, 3L, 4L), .Label = c("0.0646", "0.0713", 
    "0.1319", "0.2629", "0.3479", "0.3693", "0.4037", "0.4123"
    ), class = "factor")), class = "data.frame", row.names = c(NA, 
-9L))

第二个数据集

# Example dataset 2 (dput output): a 10-row data.frame with a single factor
# column `pred` whose levels are numeric strings.
data2=structure(list(pred = structure(c(4L, 2L, 1L, 3L, 1L, 5L, 7L, 
6L, 6L, 1L), .Label = c("0.0226902365684509", "0.0326902365684509", 
"0.0826902365684509", "0.1211001253128052", "0.411001253128052", 
"0.611001253128052", "0.64564001253128052"), class = "factor")), class = "data.frame", row.names = c(NA, 
-10L))

如何为每个变量 x1-x13 创建数据透视表,按均值聚合 data1$prediction 和 data2$pred?

例如变量 x1 的期望输出

x1  mean_prediction mean_pred
7   0.31            0.056
6   0.14            0.569

即输出必须是一个包含全部 12 个表的数据框,而不是多个分离的数据框。怎么做?

#更新

# Updated dataset (dput output): a 9-row data.frame with the grouping
# variables x1..x13 (including x5.1), the factor column `prediction`
# (levels are numeric strings) and a numeric column `pred`.
# The expression is not assigned to a name here.
structure(list(x1 = c(7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L), x2 = c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), x3 = c(1L, 1L, 1L, 2L, 1L, 2L, 
1L, 1L, 2L), x4 = c(156L, 156L, 238L, 156L, 238L, 238L, 156L, 
156L, 156L), x5 = c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), x5.1 = c(31L, 
1L, 9L, 8L, 6L, 11L, 3L, 3L, 2L), x6 = structure(c(1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("En", "RU"), class = "factor"), 
    x7 = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L), .Label = c("13", 
    "other"), class = "factor"), x8 = c(1L, 1L, 2L, 2L, 1L, 1L, 
    1L, 1L, 1L), x9 = c(0L, 1L, 0L, 0L, 2L, 3L, 2L, 0L, 0L), 
    x10 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L), x11 = c(745L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 675L), x12 = c(440L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 445L), x13 = c(2L, 
    0L, 0L, 0L, 0L, 0L, 2L, 1L, 1L), prediction = structure(c(1L, 
    5L, 6L, 8L, 7L, 2L, 3L, 3L, 4L), .Label = c("0.0646", "0.0713", 
    "0.1319", "0.2629", "0.3479", "0.3693", "0.4037", "0.4123"
    ), class = "factor"), pred = c(0.121100125, 0.032690237, 
    0.022690237, 0.082690237, 0.022690237, 0.411001253, 0.645640013, 
    0.611001253, 0.611001253)), class = "data.frame", row.names = c(NA, 
-9L))

【问题讨论】:

  • data1$prediction 和 data2$pred 有什么关系?
  • @RonakShah,这些数据可以通过 cbind 绑定。没关系。我只是复制 pred 并粘贴到新数据集中
  • data1 有 9 行,data2 有 10 行。我认为我们不能直接cbind
  • @RonakShah,这是我的错误,请检查编辑后的帖子,我做了 cbind ,这里一定是正确的

标签: r dplyr data.table


【解决方案1】:

您可以将数据转换为长格式,按每个列名和值分组,并对 prediction 和 pred 列取 mean。

library(dplyr)

# Reshape the grouping variables x1..x13 to long format, then average
# `prediction` and `pred` within every (variable name, value) pair.
# NOTE(review): assumes `df` is the combined dataset that holds x1..x13,
# a factor `prediction` and a numeric `pred` -- confirm against the caller.
df %>%
  mutate(
    # Cast every grouping column to character so pivot_longer() can stack
    # integer and factor columns into a single `value` column.
    across(x1:x13, as.character),
    # Go through character so the level labels, not the integer level codes,
    # are turned into numbers.
    prediction = as.numeric(as.character(prediction))
  ) %>%
  tidyr::pivot_longer(cols = x1:x13) %>%
  group_by(name, value) %>%
  summarise(
    # Wrap mean() in a function: forwarding extra arguments through the `...`
    # of across() is deprecated as of dplyr 1.1.0.
    across(c(prediction, pred), function(x) mean(x, na.rm = TRUE)),
    # Return an ungrouped tibble instead of one still grouped by `name`.
    .groups = "drop"
  )

#   name  value prediction   pred
#   <chr> <chr>      <dbl>  <dbl>
# 1 x1    6         0.150  0.570 
# 2 x1    7         0.320  0.0564
# 3 x10   0         0.257  0.186 
# 4 x10   1         0.197  0.628 
# 5 x11   120       0.348  0.0327
# 6 x11   130       0.404  0.0227
# 7 x11   140       0.369  0.0227
# 8 x11   200       0.412  0.0827
# 9 x11   410       0.0713 0.411 
#10 x11   460       0.132  0.628 
# … with 39 more rows

【讨论】:

    【解决方案2】:

    接受您的输入。为了便于处理,先把因子转换成字符。

    # Replace the two factor columns x6 and x7 with plain character vectors.
    data1[c("x6", "x7")] <- lapply(data1[c("x6", "x7")], as.character)
    

    然后,将值列从字符因子转换为数字,就像它们在代码中一样。

    # A factor stores integer level codes, so convert through the level labels
    # to recover the numeric values (R FAQ 7.10). Use the generic
    # as.character() rather than calling the numeric_version S3 method
    # directly on an object that is not a numeric_version.
    data1$prediction <- as.numeric(as.character(data1$prediction))
    data2$pred <- as.numeric(as.character(data2$pred))
    

    由于 data2 比 data1 多出一行,这里去掉其最后一行,只使用前 nrow(data1) 个值。

    # Keep only as many `pred` values as data1 has rows. seq_len() is used
    # instead of 1:nrow() so that a zero-row data1 yields an empty index
    # rather than c(1, 0).
    data1$pred <- data2$pred[seq_len(nrow(data1))]
    

    下面是一个函数:它接收列名字符串,创建一个包含输出所需数据的 tibble,并将分组列的值转换为字符(因为有些列是数字、有些是字符串,这样可以避免漏掉部分分组值的错误),然后输出该列各取值对应的分组均值。

    #' Mean of `prediction` and `pred` within each value of one column
    #'
    #' @param colname Name of the grouping column in `df`, as a single string.
    #' @param df Data frame holding `colname`, `prediction` and `pred`
    #'   (defaults to `data1`).
    #' @return A tibble with the columns `predictor` (the column name),
    #'   `group` (the column value as character), `mean_prediction` and
    #'   `mean_pred`, one row per distinct value of the grouping column.
    get_column_break_down <- function(colname, df = data1) {
      stopifnot(
        is.character(colname),
        length(colname) == 1L,
        colname %in% names(df)
      )
      # `[[` looks the column up by its exact name: no regex metacharacter
      # surprises (e.g. the "." in "x5.1"), and it yields a plain vector for
      # data.frames and tibbles alike.
      res_df <- tibble(
        # Character groups let integer and string columns be stacked later.
        group = as.character(df[[colname]]),
        prediction = df$prediction,
        pred = df$pred
      )
      res_df %>%
        group_by(group) %>%
        summarize(
          mean_prediction = mean(prediction),
          mean_pred = mean(pred)
        ) %>%
        mutate(predictor = colname) %>%
        ungroup() %>%
        select(predictor, group, mean_prediction, mean_pred)
    }
    

    从 data1 获取列名的向量

    # Every column except the two value columns is a grouping variable;
    # selecting by name avoids a hard-coded column count.
    colname_vec <- setdiff(names(data1), c("prediction", "pred"))
    

    使用第一列名称创建初始 data.frame 或在本例中为 tibble

    # Seed the result with the breakdown of the first grouping column.
    df <- get_column_break_down(colname = colname_vec[1])
    

    循环遍历剩余的列,将行绑定到df 变量,或者将行堆叠在一起。

    # Append the breakdowns of all remaining columns in a single bind_rows()
    # call instead of growing `df` on every iteration. `colname_vec[-1]` is
    # empty when there is only one column, whereas 2:length(colname_vec)
    # would wrongly yield c(2, 1).
    df <- bind_rows(df, lapply(colname_vec[-1], get_column_break_down))
    

    最后是输出。

    注意,因为它是一个 tibble,打印时会对数字进行四舍五入;如果您在环境中查看 df 变量或将其写入 csv,数字将与您的数字非常接近。

    df
    # A tibble: 49 x 4
     predictor group mean_prediction mean_pred
     <chr>     <chr>           <dbl>     <dbl>
     1 x1        6               0.150    0.570 
     2 x1        7               0.320    0.0564
     3 x2        1               0.197    0.611 
     4 x2        2               0.257    0.191 
     5 x3        1               0.242    0.243 
     6 x3        2               0.249    0.368 
     7 x4        156             0.225    0.351 
     8 x4        238             0.281    0.152 
     9 x5        0               0.205    0.349 
    10 x5        1               0.380    0.0577
    # ... with 39 more rows
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2013-09-08
      • 1970-01-01
      • 1970-01-01
      • 2019-06-12
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多