【问题标题】:Get the No_intersection/Complementary part of several date's intervals获取多个日期间隔的 No_intersection/Complementary 部分
【发布时间】:2019-01-11 15:31:09
【问题描述】:

我想得到 2017 年几个日期间隔的缺失部分。

例如,以下数据帧的每个“id”:

df <- data.frame(id=c(rep("a",3),rep("b",2)),
                 start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
                 end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))

id    start        end 
a     2017-01-01   2017-01-15 
a     2017-01-10   2017-01-20
a     2017-02-10   2017-02-20
b     2017-03-01   2017-03-28
b     2017-04-20   2017-04-29

我想得到:

df_final <- data.frame(id=c(rep("a",2),rep("b",3)),
                       start=c("2017-01-21","2017-02-21","2017-01-01","2017-03-29","2017-04-30"),
                       end=c("2017-02-09","2017-12-31","2017-02-28","2017-04-19","2017-12-31"))

id    start        end
a     2017-01-21   2017-02-09
a     2017-02-21   2017-12-31
b     2017-01-01   2017-02-28
b     2017-03-29   2017-04-19
b     2017-04-30   2017-12-31

谢谢!

【问题讨论】:

    标签: r


    【解决方案1】:

    首先确认startend是否为Date类。

    df$start <- as.Date(df$start)
    df$end <- as.Date(df$end)
    

    使用by()将数据根据ids拆分成两个数据框的列表。

    library(purrr)
    
    by(df, df$id, function(x){
      year <- seq(as.Date("2017-01-01"), as.Date("2017-12-31"), 1)
      ind <- map2(x$start, x$end, function(start, end){
          which(year < start | year > end)
      }) %>% reduce(intersect)
      gap <- which(diff(ind) > 1)
      head <- ind[c(1, gap + 1)] ; tail <- ind[c(gap, length(ind))]
      return(data.frame(id = unique(x$id), start = year[head], end = year[tail]))
    }) %>% reduce(rbind)
    

    说明:

    • 年份: 2017 年的所有日子。
    • ind : 沿行去掉startend 之间的日期,结果代表缺失日期的索引。
    • gap : 不连续的索引。

    输出:

    #   id      start        end
    # 1  a 2017-01-21 2017-02-09
    # 2  a 2017-02-21 2017-12-31
    # 3  b 2017-01-01 2017-02-28
    # 4  b 2017-03-29 2017-04-19
    # 5  b 2017-04-30 2017-12-31
    

    我认为我的解决方案仍然很麻烦。希望能帮到你。

    【讨论】:

      【解决方案2】:

      我最近遇到了类似的问题,我发现扩展表格以获取每个相关日期的一行,然后再折叠回范围,比尝试仅从范围端点计算出正确的逻辑更容易。

      该方法的工作原理如下。或者,也可以执行 thisthis 之类的操作,但这些方法没有您正在处理的“不在范围内”问题。

      library(dplyr)
      library(fuzzyjoin)
      library(lubridate)
      
      df <- data.frame(id=c(rep("a",3),rep("b",2)),
                       start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
                       end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))
      
      # All the dates in 2017.
      all.2017.dates = data.frame(date = seq.Date(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))
      
      # Start by expanding the original dataframe so that we get one record for each
      # id for each date in any of that id's ranges.
      df.expanded = df %>%
        # Convert the strings to real dates.
        mutate(start.date = as.Date(start),
               end.date = as.Date(end)) %>%
        # Left join to 2017 dates on dates that are in the range of this record.
        fuzzy_left_join(all.2017.dates,
                        by = c("start.date" = "date", "end.date" = "date"),
                        match_fun = list(`<=`, `>=`)) %>%
        # Filter to distinct ids/dates.
        select(id, date) %>%
        distinct()
      
      # Now, do an anti-join that gets dates NOT in an id's ranges, and collapse back
      # down to ranges.
      df.final = expand.grid(id = unique(df$id),
                             date = all.2017.dates$date) %>%
        # Anti-join on id and date.
        anti_join(df.expanded,
                  by = c("id", "date")) %>%
        # Sort by id, then date, so that the lead/lag functions behave as expected.
        arrange(id, date) %>%
        # Check whether this record is an endpoint (i.e., is it adjacent to the
        # previous/next record?).
        mutate(prev.day.included = coalesce(date == lag(date) + 1 &
                                              id == lag(id), F),
               next.day.included = coalesce(date == lead(date) - 1 &
                                              id == lag(id), F)) %>%
        # Filter to just endpoint records.
        filter(!prev.day.included | !next.day.included) %>%
        # Fill in both start and end dates on "start" records.  The start date is the
        # date in the record; the end date is the date of the next record.
        mutate(start.date = as.Date(ifelse(!prev.day.included, date, NA),
                                    origin = lubridate::origin),
               end.date = as.Date(ifelse(!prev.day.included, lead(date), NA),
                                  origin = lubridate::origin)) %>%
        filter(!is.na(start.date))
      

      【讨论】:

        【解决方案3】:

        这是我的解决方案:

        library(tidyverse)
        library(lubridate)
        library(wrapr)
        
        df %>%
          mutate_at(2:3, ymd) %>%
          group_by(id) %>%
          gather('start_end', 'date', start:end) %>%
          mutate(date = if_else(start_end == 'start', min(date), max(date))) %>%
          unique() %>%
          mutate(
            start = if_else(
              start_end == 'start',
              date %>% min() %>% year() %>% paste0('-01-01') %>% ymd(),
              date
            ),
            end = if_else(
              start_end == 'end',
              date %>% max() %>% year() %>% paste0('-12-31') %>% ymd(),
              date
          )) %>%
          filter(start != end) %>%
          select(id, start, end) %>%
          mutate(supp = TRUE) %>%
          bind_rows(mutate(df, supp = FALSE) %>% mutate_at(2:3, ymd)) %>%
          arrange(id, start) %>%
          mutate(rn = row_number()) %.>%
          left_join(., mutate(., rn = rn - 1), by = c('id', 'rn')) %>%
          na.omit() %>%
          mutate(
            start = case_when(
              (start.y >= end.x) & !supp.x ~ end.x + 1,
              (start.y >= end.x) &  supp.x ~ start.x,
              TRUE ~ as.Date(NA)
            ),
            end = case_when(
              (start.y >= end.x) &  supp.y ~ end.y,
              (start.y >= end.x) & !supp.y ~ start.y - 1,
              TRUE ~ as.Date(NA)
            )
          ) %>%
          select(id, start, end) %>%
          na.omit()
        

        【讨论】:

          猜你喜欢
          • 2021-10-13
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          相关资源
          最近更新 更多