【问题标题】:mix date periods in two dataframe在两个数据框中混合日期期间
【发布时间】:2019-12-12 17:55:37
【问题描述】:

我有一个包含客户、他们入住的酒店以及酒店的到达和离开日期的数据框。

# One row per stay: the client id, the arrival and departure dates, and the
# hotel code (reg_com) of the hotel the client stayed in.
client <- data.frame(
  id = 1:5,
  arrive = c("2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03",
             "2017-12-02"),
  departure = c("2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13",
                "2017-12-30"),
  reg_com = c(12654, 12657, 12666, 12589, 12546)
)

# I don't care about the years: parse each date column and keep only the
# "month-day" part as text.
client[c("arrive", "departure")] <- lapply(
  client[c("arrive", "departure")],
  function(dates) format(as.Date(dates), format = "%m-%d")
)

我还有一个数据框,记录各酒店的营业时段。日期中的年份对我来说并不重要,因为酒店每年的营业时段都相同。x1O 和 X1C 分别是第一个营业时段的开始日期和结束日期,X2O 和 X2C 对应第二个营业时段,X3O 和 X3C 对应第三个营业时段。也就是说,酒店的营业时间是 [x1O, X1C]、[X2O, X2C] 和 [X3O, X3C] 这三个区间的并集。

# Opening periods per hotel (reg_com). Each hotel has up to three periods;
# the "...O" column is the opening date and the "...C" column the closing
# date of that period. NA means the hotel has no such period.
# NOTE(review): "2019-02-30" is not a real calendar date, so as.Date() turns
# it into NA below.
hotel_periodes <- data.frame(
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  x1O = c("2018-12-01", "2019-03-04", "2019-04-30", "2018-12-01", "2019-04-01"),
  X1C = c("2019-04-01", "2019-05-04", "2019-12-31", "2019-01-01", "2019-06-01"),
  X2O = c(NA, "2019-06-30", NA, "2019-02-20", "2019-07-01"),
  X2C = c(NA, "2019-09-30", NA, "2019-02-30", "2019-11-02"),
  X3O = c(NA, "2019-12-01", NA, "2019-06-20", "2019-12-01"),
  X3C = c(NA, "2019-01-30", NA, "2019-11-01", "2019-12-30")
)

# I don't care about year: parse the six period columns (positions 2 to 7)
# and keep only the "month-day" part as text.
hotel_periodes[2:7] <- lapply(
  hotel_periodes[2:7],
  function(dates) format(as.Date(dates), format = "%m-%d")
)

我想知道在客户入住期间,酒店是处于营业状态还是关闭状态。


# Expected outcome: the original stays plus OPEN_HOTEL, a flag telling
# whether the hotel was open during the stay.
result <- data.frame(
  id = seq_len(5),
  arrive = c("2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03",
             "2017-12-02"),
  departure = c("2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13",
                "2017-12-30"),
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  OPEN_HOTEL = c(FALSE, TRUE, FALSE, FALSE, TRUE)
)

【问题讨论】:

    标签: r date dataframe as.date


    【解决方案1】:

    这终于对我有用了:

    # Attach each hotel's opening periods to the stays, matching on the hotel
    # code; all.x = TRUE keeps stays whose hotel has no entry.
    y <- merge(x = client, y = hotel_periodes, by = "reg_com", all.x = TRUE)
    
    
    #' Test whether dates fall strictly inside a recurring opening period.
    #'
    #' All three arguments are expected to carry the same year (for example
    #' dates parsed from "month-day" strings). When the opening date lies
    #' after the closing date, the period is treated as wrapping around the
    #' end of the year (e.g. December to April): a date is then inside the
    #' period if it is after the opening OR before the closing.
    #'
    #' @param x Dates to test.
    #' @param a Opening dates of the period.
    #' @param b Closing dates of the period.
    #' @return A logical vector: TRUE where `x` lies strictly between `a` and
    #'   `b`. Both bounds are exclusive. NA where a comparison involves NA.
    is.between <- function(x, a, b) {
      after_open <- x > a
      before_close <- b > x

      # Ordinary period: open < x < close.
      inside <- after_open & before_close

      # Wrap-around period (open > close): the season runs from `a` to the
      # end of the year and from the start of the year to `b`.
      wraps <- rep_len(a > b, length(inside))
      wraps <- !is.na(wraps) & wraps
      inside[wraps] <- (after_open | before_close)[wraps]

      inside
    }
    
    
    # Turn the "month-day" strings back into Date objects. The format has no
    # year, so as.Date() fills in the current year for every value, which
    # makes the columns comparable with each other.
    date_cols <- c("arrive", "departure",
                   "x1O", "X1C", "X2O", "X2C", "X3O", "X3C")
    y[date_cols] <- lapply(y[date_cols], as.Date, format = "%m-%d")

    # Periods a hotel does not have are NA. Replace them with a placeholder
    # date: open == close gives an empty interval, so is.between() returns
    # FALSE for it. This is done column by column, because indexing the whole
    # data frame with an NA mask that covers only some of its columns is
    # rejected by `[<-.data.frame`.
    period_cols <- c("x1O", "X1C", "X2O", "X2C", "X3O", "X3C")
    placeholder <- as.Date("1999-12-12")
    for (col in period_cols) {
      y[[col]][is.na(y[[col]])] <- placeholder
    }

    # A stay is flagged when its arrival or its departure falls inside any of
    # the three opening periods.
    y[, "correct"] <-
      is.between(y$arrive, y$x1O, y$X1C) |
      is.between(y$arrive, y$X2O, y$X2C) |
      is.between(y$arrive, y$X3O, y$X3C) |
      is.between(y$departure, y$x1O, y$X1C) |
      is.between(y$departure, y$X2O, y$X2C) |
      is.between(y$departure, y$X3O, y$X3C)
    
    
    

    【讨论】:

      【解决方案2】:

      一种可能的解决方案如下。首先,我把 hotel_periodes 数据整理成合适的格式,将其从宽格式转换为长格式(每个酒店的每个营业时段占一行)。

      library(tidyverse)
      library(lubridate)
      
      # Reshape the opening periods from one row per hotel (columns x1O, X1C,
      # X2O, ...) to one row per hotel and period with an `open` and a `close`
      # date. pivot_longer()/pivot_wider() replace the superseded
      # gather()/spread().
      hotel_periodes <- hotel_periodes %>% 
        pivot_longer(-reg_com, names_to = "period", values_to = "times") %>% 
        # The column name encodes both pieces of information: the letter O/C
        # says whether the date opens or closes the period, the digit is the
        # period number. `oc` must be derived before `period` is overwritten.
        mutate(oc = if_else(str_extract(period, "[OC]") == "O", "open", "close"), 
               period = as.numeric(str_extract(period, "\\d")), 
               times = as.Date(times)) %>% 
        pivot_wider(names_from = oc, values_from = times) %>% 
        # Drop the periods a hotel does not have.
        filter(!is.na(open)) %>% 
        # spread() returned rows sorted by hotel and period; keep that order.
        arrange(reg_com, period) %>% 
        select(reg_com, period, open, close) %>% 
        as.data.frame()
      
      hotel_periodes
      #    reg_com period       open      close
      # 1    12546      1 2019-04-01 2019-06-01
      # 2    12546      2 2019-07-01 2019-11-02
      # 3    12546      3 2019-12-01 2019-12-30
      # 4    12589      1 2018-12-01 2019-01-01
      # 5    12589      2 2019-02-20 2019-02-28
      # 6    12589      3 2019-06-20 2019-11-01
      # 7    12654      1 2018-12-01 2019-04-01
      # 8    12657      1 2019-03-04 2019-05-04
      # 9    12657      2 2019-06-30 2019-09-30
      # 10   12657      3 2019-12-01 2019-01-30
      # 11   12666      1 2019-04-30 2019-12-31
      

      然后我将这个 data.frame 与客户数据合并。你说年份对你来说并不重要,但要比较日期,就必须先把年份对齐。既然如你所说,营业时段每年都相同,我用了一个小技巧:把关闭日期的年份设为离开日期的年份,并把开业日期的年份设为离开日期的年份或其前一年(以便处理酒店从十二月营业到次年一月的情况)。然后我比较开业、关闭、到达和离开这四个日期:如果到达日期和离开日期都落在开业日期和关闭日期之间,则返回 TRUE。最后,我按每位客户、酒店以及到达和离开日期汇总结果。

      # Parse the stay dates so they can be compared with the opening periods.
      client <- client %>% 
        mutate(arrive = as.Date(arrive), 
               departure = as.Date(departure))
      
      # One row per stay and opening period of the hotel. The periods repeat
      # every year, so each period is shifted to the years of the stay before
      # comparing:
      #  * `close` is moved to the departure year; `open` is moved to the same
      #    year, or to the previous one when the period wraps around New Year
      #    (e.g. December to January).
      #  * `open_next`/`close_next` are the same period one year later. Without
      #    them, a stay lying entirely in the December part of a wrap-around
      #    period would be compared only with the period that ended the
      #    previous January and would wrongly count as "closed".
      # A stay counts as "open" when it lies completely inside either one.
      left_join(client, hotel_periodes, by = "reg_com") %>% 
        mutate(close = `year<-`(close, year(departure)),
               open = if_else(`year<-`(open, year(departure)) <= close, 
                              `year<-`(open, year(departure)), 
                              `year<-`(open, year(departure) - 1)),
               open_next = `year<-`(open, year(open) + 1),
               close_next = `year<-`(close, year(close) + 1),
               between = (open <= arrive & departure <= close) |
                 (open_next <= arrive & departure <= close_next)) %>% 
        group_by(id, arrive, departure, reg_com) %>% 
        summarize(OPEN_HOTEL = any(between))
      
      # A tibble: 5 x 5
      # Groups:   id, arrive, departure [5]
      #      id arrive     departure  reg_com OPEN_HOTEL
      #   <int> <date>     <date>       <dbl> <lgl>  
      # 1     1 2019-05-01 2019-05-31   12654 FALSE  
      # 2     2 2018-01-03 2018-01-21   12657 TRUE   
      # 3     3 2019-04-05 2019-04-25   12666 FALSE  
      # 4     4 2015-05-03 2015-05-13   12589 FALSE  
      # 5     5 2017-12-02 2017-12-30   12546 TRUE 
      

      数据

      注意:我手动将日期 2019-02-30 更改为 2019-02-28,因为 2 月 30 日并不存在,而这也不是问题的重点。不过,在合并 data.frame 之前,有必要先验证日期是否有效。

      # Stays: client id, arrival and departure dates (as text) and the hotel
      # code. Dates are kept as character, not factor.
      client <- data.frame(
        id = seq_len(5),
        arrive = c("2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03",
                   "2017-12-02"),
        departure = c("2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13",
                      "2017-12-30"),
        reg_com = c(12654, 12657, 12666, 12589, 12546),
        stringsAsFactors = FALSE
      )
      
      # Opening periods per hotel: up to three (open, close) pairs, NA where a
      # hotel has no such period.
      hotel_periodes <- data.frame(
        reg_com = c(12654, 12657, 12666, 12589, 12546),
        x1O = c("2018-12-01", "2019-03-04", "2019-04-30", "2018-12-01", "2019-04-01"),
        X1C = c("2019-04-01", "2019-05-04", "2019-12-31", "2019-01-01", "2019-06-01"),
        X2O = c(NA, "2019-06-30", NA, "2019-02-20", "2019-07-01"),
        X2C = c(NA, "2019-09-30", NA, "2019-02-28", "2019-11-02"),
        X3O = c(NA, "2019-12-01", NA, "2019-06-20", "2019-12-01"),
        X3C = c(NA, "2019-01-30", NA, "2019-11-01", "2019-12-30"),
        stringsAsFactors = FALSE
      )
      

      【讨论】:

        猜你喜欢
        • 2019-08-08
        • 2019-08-18
        • 1970-01-01
        • 2020-04-20
        • 2021-11-29
        • 2021-10-24
        • 2019-08-18
        • 2021-09-14
        • 2019-05-27
        相关资源
        最近更新 更多