【问题标题】:Merging values from wide data frame into long data frame(将宽数据框中的值合并到长数据框中)
【发布时间】:2019-07-05 17:01:14
【问题描述】:

我有两个数据框,一宽一长:

    # Long-format example input: one row per (PID, scan_name) pair, 8 rows total.
    long_df <- structure(
      list(
        # PID 1001 has three scans, 1002 has four, 1003 has one.
        PID = rep(c(1001, 1002, 1003), times = c(3L, 4L, 1L)),
        scan_name = c(
          "01_001A", "01_001B", "01_001C",
          "01_002A", "01_002B", "01_002D", "01_002E",
          "01_003B"
        )
      ),
      row.names = c(NA, -8L),
      class = c("tbl_df", "tbl", "data.frame")
    )

# Wide-format example input: one row per PID, with up to five
# (scan_name_<n>, scan_date_<n>) column pairs; unused slots are NA.
# Dates are stored as POSIXct (seconds since the epoch) with tzone "UTC".
wide_df <- structure(
  list(
    PID = c(1001, 1002, 1003),
    scan_name_1 = c("01_001A", "01_002A", NA),
    scan_date_1 = structure(
      c(1206748800, 1240876800, NA),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    ),
    scan_name_2 = c("01_001B", "01_002B", "01_003B"),
    scan_date_2 = structure(
      c(1238544000, 1272672000, 1424736000),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    ),
    scan_name_3 = c("01_001C", NA, NA),
    scan_date_3 = structure(
      c(1301702400, NA, NA),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    ),
    scan_name_4 = c(NA, "01_002D", NA),
    scan_date_4 = structure(
      c(NA, 1400112000, NA),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    ),
    scan_name_5 = c(NA, "01_002E", NA),
    scan_date_5 = structure(
      c(NA, 1430438400, NA),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    )
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)

我正在尝试将值“scan_date_1”、“scan_date_2”等从wide_df 获取到long_df。

我试图得到的输出如下所示:

# Desired result: the rows of long_df with the matching scan_date
# (POSIXct, tzone "UTC") attached to each (PID, scan_name) pair.
goal_df <- structure(
  list(
    PID = rep(c(1001, 1002, 1003), times = c(3L, 4L, 1L)),
    scan_name = c(
      "01_001A", "01_001B", "01_001C",
      "01_002A", "01_002B", "01_002D", "01_002E",
      "01_003B"
    ),
    scan_date = structure(
      c(
        1206748800, 1238544000, 1301702400,
        1240876800, 1272672000, 1400112000, 1430438400,
        1424736000
      ),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    )
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)

看起来很简单,但我使用合并/融化/等的尝试都没有成功。非常感谢任何和所有帮助! (第一次使用“dput”,所以希望这是一个可复制的例子)

【问题讨论】:

  • 我认为你需要通过 'PID' 加入
  • 你能发布预期的输出吗?
  • 我已经编辑了我的帖子以添加目标输出,谢谢@RuiBarradas

标签: r dplyr


【解决方案1】:

这可以通过 data.table 的 melt 函数及其 patterns 参数有效地完成:

library(data.table)

# setDT() converts wide_df to a data.table by reference, i.e. wide_df itself
# is changed and no copy is made.
setDT(wide_df)

# Melt the scan_name_* and scan_date_* column groups in parallel: each
# pattern in patterns() yields one output column, named by value.name.
# The argument is spelled out as measure.vars rather than relying on
# partial matching of `measure`.
goal_df <- melt(
  wide_df,
  measure.vars = patterns("scan_name", "scan_date"),
  value.name = c("scan_name", "scan_date")
)
# Drop the rows that contain NA, i.e. the unused scan slots.
goal_df <- na.omit(goal_df)

# Remove the helper column `variable` in place; the trailing [] prints the
# result.
goal_df[, variable := NULL][]
#>     PID scan_name  scan_date
#> 1: 1001   01_001A 2008-03-29
#> 2: 1002   01_002A 2009-04-28
#> 3: 1001   01_001B 2009-04-01
#> 4: 1002   01_002B 2010-05-01
#> 5: 1003   01_003B 2015-02-24
#> 6: 1001   01_001C 2011-04-02
#> 7: 1002   01_002D 2014-05-15
#> 8: 1002   01_002E 2015-05-01

注意:tidyr 的开发版本(GitHub)新增了功能类似的函数 pivot_longer 和 pivot_wider,请参阅 tidyr 的 Pivoting 文档。使用 pivot_longer 可以这样做:

library(dplyr)
library(tidyr)

# Reshape every scan_name_<n> / scan_date_<n> pair into one row per scan.
# ".value" sends the first capture group (scan_date or scan_name) to the
# output column name, so names and dates land in separate columns and each
# keeps its own type; scan_date therefore stays POSIXct and does not need
# to be converted to character first.
# The pattern is anchored and uses \\d+ so that multi-digit suffixes such as
# scan_name_10 are parsed as id "10" instead of matching as id "1".
# pivot_longer() is part of released tidyr from version 1.0.0 onwards.
wide_df %>%
  pivot_longer(
    -PID,
    names_to = c(".value", "id"),
    names_pattern = "^(scan_date|scan_name)_(\\d+)$",
    values_drop_na = TRUE
  ) %>%
  select(-id)
#> # A tibble: 8 x 3
#>     PID scan_name scan_date          
#>   <dbl> <chr>     <dttm>             
#> 1  1001 01_001A   2008-03-29 00:00:00
#> 2  1001 01_001B   2009-04-01 00:00:00
#> 3  1001 01_001C   2011-04-02 00:00:00
#> 4  1002 01_002A   2009-04-28 00:00:00
#> 5  1002 01_002B   2010-05-01 00:00:00
#> 6  1002 01_002D   2014-05-15 00:00:00
#> 7  1002 01_002E   2015-05-01 00:00:00
#> 8  1003 01_003B   2015-02-24 00:00:00

# Print the version of the tidyr package installed in the current session.
packageVersion("tidyr")
#> ‘0.8.3.9000’

【讨论】:

    【解决方案2】:

    此代码将 wide_df 转换为长格式,其中每一行对应某个人(PID)的一次事件。

    library(magrittr)
    # Column names have the form scan_<field>_<n>: capture group 1 is the
    # field ("date" or "name"), capture group 2 is the numeric event index.
    pattern <- "^scan_(date|name)_(\\d+)$"

    # A single pivot_longer() call replaces the superseded gather()/spread()
    # round trip: ".value" turns capture group 1 into the output columns
    # `date` and `name`, and capture group 2 becomes `event_id`.
    wide_df %>%
      # Every column (including PID) is converted to character, as before.
      dplyr::mutate_all(as.character) %>%
      tidyr::pivot_longer(
        -PID,
        names_to = c(".value", "event_id"),
        names_pattern = pattern
      ) %>%
      dplyr::mutate(
        event_id = as.integer(event_id),
        date     = as.Date(date)
      ) %>%
      # Restore the column order produced by spread(): date before name.
      dplyr::select(PID, event_id, date, name)
    

    结果:

    # A tibble: 15 x 4
       PID   event_id date       name   
       <chr>    <int> <date>     <chr>  
     1 1001         1 2008-03-29 01_001A
     2 1001         2 2009-04-01 01_001B
     3 1001         3 2011-04-02 01_001C
     4 1001         4 NA         NA     
     5 1001         5 NA         NA     
     6 1002         1 2009-04-28 01_002A
     7 1002         2 2010-05-01 01_002B
     8 1002         3 NA         NA     
     9 1002         4 2014-05-15 01_002D
    10 1002         5 2015-05-01 01_002E
    11 1003         1 NA         NA     
    12 1003         2 2015-02-24 01_003B
    13 1003         3 NA         NA     
    14 1003         4 NA         NA     
    15 1003         5 NA         NA     
    

    【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 2017-07-19
    • 2021-12-01
    • 2021-12-21
    • 1970-01-01
    • 2015-10-11
    • 1970-01-01
    • 2013-06-14
    相关资源
    最近更新 更多