【问题标题】:Convert from long to wide format from categorical data(将分类数据从长格式转换为宽格式)
【发布时间】:2020-01-13 08:38:49
【问题描述】:

有这样的分类数据:

 # Sample categorical data. It is bound to `df` because the snippets that
 # operate on `df` need an object of that name; without the assignment `df`
 # resolves to the stats::df() function instead of this data.
 df <- data.frame(
   id     = c(1, 2, 3, 4, 5),
   stock1 = c(1, 2, 0, 1, 2),
   stock2 = c(0, 1, 0, 1, 1),
   end    = c(0, 1, 3, 0, 3),
   start  = c(2, 3, 0, 1, 0)
 )
 df
 #   id stock1 stock2 end start
 # 1  1      1      0   0     2
 # 2  2      2      1   1     3
 # 3  3      0      0   3     0
 # 4  4      1      1   0     1
 # 5  5      2      1   3     0

如何将其从长格式转换为宽格式,使每个“变量名_取值”组合成为单独的一列,并用 0/1 表示该行是否出现该取值?

预期输出示例:

# Expected result: one 0/1 indicator column per observed <variable>_<value>
# pair of the sample data. stock2_1 is 1 for ids 2, 4 and 5 because the
# input has stock2 = c(0, 1, 0, 1, 1).
expected <- data.frame(
  id       = c(1, 2, 3, 4, 5),
  stock1_0 = c(0, 0, 1, 0, 0),
  stock1_1 = c(1, 0, 0, 1, 0),
  stock1_2 = c(0, 1, 0, 0, 1),
  stock2_0 = c(1, 0, 1, 0, 0),
  stock2_1 = c(0, 1, 0, 1, 1),
  end_0    = c(1, 0, 0, 1, 0),
  end_1    = c(0, 1, 0, 0, 0),
  end_3    = c(0, 0, 1, 0, 1),
  start_0  = c(0, 0, 1, 0, 1),
  start_1  = c(0, 0, 0, 1, 0),
  start_2  = c(1, 0, 0, 0, 0),
  start_3  = c(0, 1, 0, 0, 0)
)
expected
#   id stock1_0 stock1_1 stock1_2 stock2_0 stock2_1 end_0 end_1 end_3 start_0
# 1  1        0        1        0        1        0     1     0     0       0
# 2  2        0        0        1        0        1     0     1     0       0
# 3  3        1        0        0        1        0     0     0     1       1
# 4  4        0        1        0        0        1     1     0     0       0
# 5  5        0        0        1        0        1     0     0     1       1
#   start_1 start_2 start_3
# 1       0       1       0
# 2       0       0       1
# 3       0       0       0
# 4       1       0       0
# 5       0       0       0

【问题讨论】:

    标签: r


    【解决方案1】:

    你可以使用model.matrix

    # One-hot encode every column except the first (id) with model.matrix().
    # Each indicator column is named <variable>_<level> using the factor's
    # actual levels (0, 1, 3, ...) rather than their positions (1, 2, 3, ...),
    # so the column names reflect the values that occur in the data.
    one_hot <- lapply(names(dat)[-1], function(nm) {
      f <- factor(dat[[nm]])
      # "- 1" drops the intercept so every level gets its own indicator column.
      m <- model.matrix(~ f - 1)
      colnames(m) <- paste(nm, levels(f), sep = "_")
      m
    })
    data.frame(dat[1], do.call(cbind, one_hot))

    #   id stock1_0 stock1_1 stock1_2 stock2_0 stock2_1 end_0 end_1 end_3 start_0
    # 1  1        0        1        0        1        0     1     0     0       0
    # 2  2        0        0        1        0        1     0     1     0       0
    # 3  3        1        0        0        1        0     0     0     1       1
    # 4  4        0        1        0        0        1     1     0     0       0
    # 5  5        0        0        1        0        1     0     0     1       1
    #   start_1 start_2 start_3
    # 1       0       1       0
    # 2       0       0       1
    # 3       0       0       0
    # 4       1       0       0
    # 5       0       0       0
    

    数据:

    # Sample data: five ids with four categorical (numeric-coded) columns.
    dat <- data.frame(
      id     = c(1, 2, 3, 4, 5),
      stock1 = c(1, 2, 0, 1, 2),
      stock2 = c(0, 1, 0, 1, 1),
      end    = c(0, 1, 3, 0, 3),
      start  = c(2, 3, 0, 1, 0)
    )
    

    【讨论】:

      【解决方案2】:
      library(data.table)
      # Turn df into a data.table by reference.
      setDT(df)

      # Stack every non-id column into (id, variable, value) rows, then count
      # how often each "<variable>_<value>" key occurs per id; with one row
      # per id and variable the counts are the 0/1 indicators.
      long <- melt(df, id.vars = "id")
      dcast(long, id ~ paste0(variable, "_", value), fun.aggregate = length)
      
      
      #    id end_0 end_1 end_3 start_0 start_1 start_2 start_3 stock1_0
      # 1:  1     1     0     0       0       0       1       0        0
      # 2:  2     0     1     0       0       0       0       1        0
      # 3:  3     0     0     1       1       0       0       0        1
      # 4:  4     1     0     0       0       1       0       0        0
      # 5:  5     0     0     1       1       0       0       0        0
      #    stock1_1 stock1_2 stock2_0 stock2_1
      # 1:        1        0        1        0
      # 2:        0        1        0        1
      # 3:        0        0        1        0
      # 4:        1        0        0        1
      # 5:        0        1        0        1
      

      【讨论】:

        【解决方案3】:

        一种方法是获取长格式数据,将列名与值结合起来,然后以宽格式获取数据。

        library(dplyr)
        library(tidyr)
        
        # 1. Stack all non-id columns into name/value pairs.
        long <- pivot_longer(
          df,
          cols = -id,
          names_to = "name",
          values_to = "value"
        )
        # 2. Fuse column name and value into a single "<name>_<value>" key.
        keyed <- unite(long, col = "name", name, value, sep = "_")
        # 3. Flag every observed key with 1 ...
        flagged <- mutate(keyed, value = 1)
        # 4. ... and spread the keys back into columns, filling the gaps with 0.
        pivot_wider(
          flagged,
          names_from = name,
          values_from = value,
          values_fill = list(value = 0)
        )
        
        # A tibble: 5 x 13
        #     id stock1_1 stock2_0 end_0 start_2 stock1_2 stock2_1 end_1 start_3 stock1_0 end_3 start_0 start_1
        #  <dbl>    <dbl>    <dbl> <dbl>   <dbl>    <dbl>    <dbl> <dbl>   <dbl>    <dbl> <dbl>   <dbl>   <dbl>
        #1     1        1        1     1       1        0        0     0       0        0     0       0       0
        #2     2        0        0     0       0        1        1     1       1        0     0       0       0
        #3     3        0        1     0       0        0        0     0       0        1     1       1       0
        #4     4        1        0     1       0        0        1     0       0        0     0       0       1
        #5     5        0        0     0       0        1        1     0       0        0     1       1       0
        

        【讨论】:

          猜你喜欢
          • 1970-01-01
          • 2023-02-25
          • 1970-01-01
          • 2018-01-31
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          相关资源
          最近更新 更多