【问题标题】:Splitting/Separating a Character Column into Multiple Columns with specified names & formats将字符列拆分/分隔为具有特定名称和格式的多列
【发布时间】:2021-02-24 15:59:50
【问题描述】:

我一直在寻找如何使用 stringr::str_split 或 tidyr::separate 来完成我想做的事情的示例,但不确定最有效的做法是什么。我基本上希望利用下面示例中 data 列出的 2 列,创建下面 output_df 中显示的 4 列。

对于 output_df:前 3 列来自 base_out_state 列,第 4 列是 data 中的 avg_re。output_df 中的所有列都应该是数值型。前 3 列的列名带有前缀 on_。

我猜这个解决方案也会使用purrr,但我没有成功。

# Example input: a two-column tibble.
#   base_out_state - character; each value is an out count, the literal token
#                    "outs,", then three base slots ("1b"/"2b"/"3b", or "_"
#                    when the slot is empty). Pieces are separated by one or
#                    two spaces.
#   avg_re         - numeric value attached to each state.
data <- tibble::tribble(
       ~base_out_state,           ~avg_re,
  "0  outs,  1b 2b 3b",  2.53237410071942,
   "0  outs,  _ 2b 3b",  1.95045045045045,
  "1  outs,  1b 2b 3b",  1.73913043478261,
   "0  outs,  1b 2b _",  1.60282021151586,
   "0  outs,  1b _ 3b",  1.59868421052632,
   "1  outs,  _ 2b 3b",  1.47916666666667,
    "0  outs,  _ _ 3b",  1.42028985507246,
   "1  outs,  1b _ 3b",  1.27450980392157,
    "0  outs,  _ 2b _",  1.11675126903553,
    "1  outs,  _ _ 3b", 0.960416666666667,
   "1  outs,  1b 2b _", 0.939353099730458,
    "0  outs,  1b _ _", 0.925538103548575,
  "2  outs,  1b 2b 3b", 0.740189445196211,
    "1  outs,  _ 2b _", 0.708523096942095,
    "1  outs,  1b _ _", 0.568587968789328,
   "2  outs,  _ 2b 3b",  0.55668358714044,
     "0  outs,  _ _ _", 0.534048257372654,
   "2  outs,  1b _ 3b",           0.53125,
   "2  outs,  1b 2b _", 0.463123644251627,
    "2  outs,  _ _ 3b",              0.39,
    "2  outs,  _ 2b _", 0.324457593688363,
     "1  outs,  _ _ _", 0.286259541984733,
    "2  outs,  1b _ _", 0.230750721847931,
     "2  outs,  _ _ _", 0.104665825977301
  )



# Desired result, row-for-row aligned with `data`: the leading out count as
# outs_when_up, one 0/1 indicator per base slot (on_1b, on_2b, on_3b; 0 where
# the input shows "_"), and avg_re carried over unchanged. Every column is
# numeric.
output_df<- tibble::tribble(
               ~outs_when_up, ~on_1b, ~on_2b, ~on_3b,           ~avg_re,
                           0,      1,      1,      1,  2.53237410071942,
                           0,      0,      1,      1,  1.95045045045045,
                           1,      1,      1,      1,  1.73913043478261,
                           0,      1,      1,      0,  1.60282021151586,
                           0,      1,      0,      1,  1.59868421052632,
                           1,      0,      1,      1,  1.47916666666667,
                           0,      0,      0,      1,  1.42028985507246,
                           1,      1,      0,      1,  1.27450980392157,
                           0,      0,      1,      0,  1.11675126903553,
                           1,      0,      0,      1, 0.960416666666667,
                           1,      1,      1,      0, 0.939353099730458,
                           0,      1,      0,      0, 0.925538103548575,
                           2,      1,      1,      1, 0.740189445196211,
                           1,      0,      1,      0, 0.708523096942095,
                           1,      1,      0,      0, 0.568587968789328,
                           2,      0,      1,      1,  0.55668358714044,
                           0,      0,      0,      0, 0.534048257372654,
                           2,      1,      0,      1,           0.53125,
                           2,      1,      1,      0, 0.463123644251627,
                           2,      0,      0,      1,              0.39,
                           2,      0,      1,      0, 0.324457593688363,
                           1,      0,      0,      0, 0.286259541984733,
                           2,      1,      0,      0, 0.230750721847931,
                           2,      0,      0,      0, 0.104665825977301
               )

【问题讨论】:

    标签: r tidyr purrr stringr


    【解决方案1】:

    怎么样:

    library(tidyverse)

    # Split base_out_state on runs of whitespace into five pieces, discard the
    # literal "outs," piece, then recode the pieces as integers: each base slot
    # becomes 0L when it holds "_" and 1L otherwise, and the out count is parsed
    # with as.integer().
    data %>%
      separate(
        col = base_out_state,
        into = c("outs_when_up", "outs", "on_1b", "on_2b", "on_3b"),
        sep = "\\s+"
      ) %>%
      select(-outs) %>%
      mutate(
        across(
          starts_with("on"),
          .fns = function(slot) ifelse(slot == "_", 0L, 1L)
        ),
        outs_when_up = as.integer(outs_when_up)
      )
    #> # A tibble: 24 x 5
    #>    outs_when_up on_1b on_2b on_3b avg_re
    #>           <int> <int> <int> <int>  <dbl>
    #>  1            0     1     1     1  2.53 
    #>  2            0     0     1     1  1.95 
    #>  3            1     1     1     1  1.74 
    #>  4            0     1     1     0  1.60 
    #>  5            0     1     0     1  1.60 
    #>  6            1     0     1     1  1.48 
    #>  7            0     0     0     1  1.42 
    #>  8            1     1     0     1  1.27 
    #>  9            0     0     1     0  1.12 
    #> 10            1     0     0     1  0.960
    #> # … with 14 more rows
    

    reprex package (v1.0.0) 于 2021 年 2 月 24 日创建

    【讨论】:

      【解决方案2】:

      splitstackshape 也可以。

      library(splitstackshape)
      library(tidyverse)

      # Split base_out_state on spaces into numbered columns
      # (base_out_state_1 ... base_out_state_5), recode the three base-slot
      # pieces to 0/1, drop the original column and the second piece
      # (presumably the literal "outs," token), and give the remaining pieces
      # their final names.
      cSplit(
        data,
        splitCols = "base_out_state",
        sep = " ",
        direction = "wide",
        drop = FALSE
      ) %>%
        mutate(
          across(matches("state_3|state_4|state_5"), ~ ifelse(.x == "_", 0, 1))
        ) %>%
        dplyr::select(-base_out_state, -base_out_state_2) %>%
        # Rename each column explicitly rather than with the superseded
        # rename_at(): the name mapping no longer depends on how many columns
        # happen to match a pattern, or on their order.
        rename(
          outs_when_up = base_out_state_1,
          on_1b = base_out_state_3,
          on_2b = base_out_state_4,
          on_3b = base_out_state_5
        )
      
      #        avg_re outs_when_up on_1b on_2b on_3b
      #  1: 2.5323741            0     1     1     1
      #  2: 1.9504505            0     0     1     1
      #  3: 1.7391304            1     1     1     1
      #  4: 1.6028202            0     1     1     0
      #  5: 1.5986842            0     1     0     1
      #  6: 1.4791667            1     0     1     1
      #  7: 1.4202899            0     0     0     1
      #  8: 1.2745098            1     1     0     1
      #  9: 1.1167513            0     0     1     0
      # 10: 0.9604167            1     0     0     1
      # 11: 0.9393531            1     1     1     0
      # 12: 0.9255381            0     1     0     0
      # 13: 0.7401894            2     1     1     1
      # 14: 0.7085231            1     0     1     0
      # 15: 0.5685880            1     1     0     0
      # 16: 0.5566836            2     0     1     1
      # 17: 0.5340483            0     0     0     0
      # 18: 0.5312500            2     1     0     1
      # 19: 0.4631236            2     1     1     0
      # 20: 0.3900000            2     0     0     1
      # 21: 0.3244576            2     0     1     0
      # 22: 0.2862595            1     0     0     0
      # 23: 0.2307507            2     1     0     0
      # 24: 0.1046658            2     0     0     0
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 2018-09-22
        • 1970-01-01
        • 2022-01-18
        • 1970-01-01
        • 2019-12-14
        • 2018-03-17
        • 1970-01-01
        相关资源
        最近更新 更多