比较数据框以提取特定值答案

【问题标题】：Comparing data frames to extract specific values比较数据框以提取特定值
【发布时间】：2018-08-04 16:09:21
【问题描述】：

我有两个数据框：

# Reference table: one timestamp (stored as text) for each group A-F.
df <- data.frame(
  Group = LETTERS[1:6],
  Date = c(
    "2018-04-12 08:56:00",
    "2018-04-13 11:03:00",
    "2018-04-14 14:30:00",
    "2018-04-15 03:10:00",
    "2018-04-16 07:28:00",
    "2018-04-17 11:17:00"
  )
)

# Lookup table: two or three candidate timestamps (as text) per group.
df2 <- data.frame(
  Group = rep(LETTERS[1:6], times = c(2, 2, 3, 2, 2, 2)),
  Date = c(
    "2018-04-12 08:56:00", "2018-04-12 10:42:00",
    "2018-04-13 10:03:00", "2018-04-13 11:21:00",
    "2018-04-14 08:17:00", "2018-04-14 10:32:00", "2018-04-14 22:44:00",
    "2018-04-15 03:10:00", "2018-04-15 11:17:00",
    "2018-04-16 16:56:00", "2018-04-16 20:01:00",
    "2018-04-17 11:15:00", "2018-04-17 11:20:00"
  )
)

我想做两件事。首先，按组，我想将 df 中的 Date 列与 df2 中的 Date 列进行比较，并提取完全匹配的 Date；如果没有完全匹配，则从 df2 中提取早于 df 中日期且最接近的 Date。

其次，按组，我想将 df 中的 Date 列与 df2 中的列进行比较，如果完全匹配，则提取 Date，如果没有完全匹配，则从 df2 中提取最近的 Date，不管它是否在 df 中的日期之前。

所以这个例子的结果应该像下面这样：

# Expected output for the sample data.
#   Return1: the df2 timestamp equal to Date, else the closest earlier one
#            (NA when the group has no earlier timestamp).
#   Return2: the df2 timestamp equal to Date, else the closest one in either
#            direction.
result <- data.frame(
  Group = LETTERS[1:6],
  Date = c(
    "2018-04-12 08:56:00",
    "2018-04-13 11:03:00",
    "2018-04-14 14:30:00",
    "2018-04-15 03:10:00",
    "2018-04-16 07:28:00",
    "2018-04-17 11:17:00"
  ),
  Return1 = c(
    "2018-04-12 08:56:00",
    "2018-04-13 10:03:00",
    "2018-04-14 10:32:00",
    "2018-04-15 03:10:00",
    NA_character_,
    "2018-04-17 11:15:00"
  ),
  Return2 = c(
    "2018-04-12 08:56:00",
    "2018-04-13 11:21:00",
    "2018-04-14 10:32:00",
    "2018-04-15 03:10:00",
    "2018-04-16 16:56:00",
    "2018-04-17 11:15:00"
  )
)

【问题讨论】：

重新创建结果的代码不起作用（组的元素比其他列多）。另外，您尝试过什么来解决您的问题？
啊。谢谢你的提醒。我刚刚更新了结果

标签： r date merge posixct

【解决方案1】：

这就是我认为您正在寻找的东西。

library(dplyr)
library(purrr)
library(lubridate)
library(data.table)

# Parse the timestamp strings into date-times. df2 also keeps a copy of each
# timestamp in Result1 so the matched df2 value is still available after the
# join; df3 is the same table with that copy named Result2.
df <- mutate(df, Date = parse_date_time(Date, orders = "ymd HMS"))
df2 <- mutate(df2, Date = parse_date_time(Date, orders = "ymd HMS"))
df2 <- mutate(df2, Result1 = Date)
df3 <- rename(df2, Result2 = Result1)

# Convert the three tables to data.tables by reference, keyed on Group + Date
# so the rolling joins below match within a group and roll along Date.
setDT(df, key = c("Group", "Date"))
setDT(df2, key = c("Group", "Date"))
setDT(df3, key = c("Group", "Date"))

# Result1: exact match, otherwise the last df2 timestamp before Date
#          (roll = Inf).
# Result2: exact match, otherwise the closest df2 timestamp on either side
#          (roll = "nearest").
full_join(
  df2[df, roll = Inf],
  df3[df, roll = "nearest"],
  by = c("Group", "Date")
)

#   Group                Date             Result1             Result2
# 1     A 2018-04-12 08:56:00 2018-04-12 08:56:00 2018-04-12 08:56:00
# 2     B 2018-04-13 11:03:00 2018-04-13 10:03:00 2018-04-13 11:21:00
# 3     C 2018-04-14 14:30:00 2018-04-14 10:32:00 2018-04-14 10:32:00
# 4     D 2018-04-15 03:10:00 2018-04-15 03:10:00 2018-04-15 03:10:00
# 5     E 2018-04-16 07:28:00                <NA> 2018-04-16 16:56:00
# 6     F 2018-04-17 11:17:00 2018-04-17 11:15:00 2018-04-17 11:15:00

【讨论】：

【解决方案2】：

这是仅使用基础 R 的可能解决方案：

# Parse both Date columns as POSIXct. The zone is pinned to UTC so the
# computed differences do not depend on the session's local time zone or on
# daylight-saving shifts (the strings carry no zone information).
d1 <- as.POSIXct(df$Date, tz = "UTC")
d2 <- as.POSIXct(df2$Date, tz = "UTC")

# m[i, j] = df$Date[i] - df2$Date[j] in seconds, or NA when row i of df and
# row j of df2 belong to different groups. Groups are compared as character
# so factor columns with different level sets do not raise an error.
m <- outer(
  seq_len(nrow(df)), seq_len(nrow(df2)),
  function(i, j) {
    secs <- as.numeric(d1[i]) - as.numeric(d2[j])
    same_group <- as.character(df$Group[i]) == as.character(df2$Group[j])
    ifelse(same_group, secs, NA_real_)
  }
)

# Index of the first minimum of x, ignoring NAs; NA when x is empty or all NA.
# (order(x)[1] would return 1 for an all-NA row and so pick a date belonging
# to another group.)
first_min_index <- function(x) {
  k <- which.min(x)
  if (length(k) == 0L) NA_integer_ else k
}

# Copy df into res; the two result columns are added below.
res <- df

# Return1: the df2 date with the smallest difference >= 0, i.e. an exact
# match or else the closest earlier date; NA when the group has none.
rows_prior <- vapply(
  seq_len(nrow(m)),
  function(i) {
    r <- m[i, ]
    r[!is.na(r) & r < 0] <- NA_real_ # discard df2 dates later than df's date
    first_min_index(r)
  },
  integer(1)
)
res$Return1 <- df2$Date[rows_prior]

# Return2: the df2 date with the smallest absolute difference, whichever side
# of df's date it falls on; NA when the group has no rows in df2.
rows_nearest <- vapply(
  seq_len(nrow(m)),
  function(i) first_min_index(abs(m[i, ])),
  integer(1)
)
res$Return2 <- df2$Date[rows_nearest]

> res
  Group                Date             Return1             Return2
1     A 2018-04-12 08:56:00 2018-04-12 08:56:00 2018-04-12 08:56:00
2     B 2018-04-13 11:03:00 2018-04-13 10:03:00 2018-04-13 11:21:00
3     C 2018-04-14 14:30:00 2018-04-14 10:32:00 2018-04-14 10:32:00
4     D 2018-04-15 03:10:00 2018-04-15 03:10:00 2018-04-15 03:10:00
5     E 2018-04-16 07:28:00                <NA> 2018-04-16 16:56:00
6     F 2018-04-17 11:17:00 2018-04-17 11:15:00 2018-04-17 11:15:00

这是另一种可能更高效的解决方案（同样仅使用基础 R）：

# Parse both Date columns as POSIXct. The zone is pinned to UTC so the
# computed differences do not depend on the session's local time zone or on
# daylight-saving shifts (the strings carry no zone information).
d1 <- as.POSIXct(df$Date, tz = "UTC")
d2 <- as.POSIXct(df2$Date, tz = "UTC")

# Row indexes of df2, split into a list keyed by Group.
df2splits <- split(seq_len(nrow(df2)), df2$Group)
# For each row of df, the position of its Group in df2splits
# (NA when that group has no rows in df2).
splitIdxs <- match(df$Group, names(df2splits))

# Row index into df2 of the date matched to row i of df, or NA if there is
# none.
#   prior_only = TRUE : smallest difference >= 0 (exact or closest earlier).
#   prior_only = FALSE: smallest absolute difference (closest on either side).
# Ties resolve to the first candidate in df2 row order.
nearest_df2_row <- function(i, prior_only) {
  if (is.na(splitIdxs[i])) {
    return(NA_integer_)
  }
  idx <- df2splits[[splitIdxs[i]]]
  differ <- as.numeric(d1[i]) - as.numeric(d2[idx])
  if (prior_only) {
    differ[!is.na(differ) & differ < 0] <- NA_real_ # drop later df2 dates
  } else {
    differ <- abs(differ)
  }
  k <- which.min(differ) # ignores NA; integer(0) when nothing qualifies
  if (length(k) == 0L) NA_integer_ else idx[k]
}

# Start from a copy of df so this snippet does not rely on a `res` object
# left over from earlier code.
res <- df

# Return1: exact match or closest earlier df2 date within the same group.
res$Return1 <- df2$Date[
  vapply(seq_len(nrow(df)), nearest_df2_row, integer(1), prior_only = TRUE)
]
# Return2: exact match or closest df2 date within the same group, regardless
# of direction.
res$Return2 <- df2$Date[
  vapply(seq_len(nrow(df)), nearest_df2_row, integer(1), prior_only = FALSE)
]

> res
  Group                Date             Return1             Return2
1     A 2018-04-12 08:56:00 2018-04-12 08:56:00 2018-04-12 08:56:00
2     B 2018-04-13 11:03:00 2018-04-13 10:03:00 2018-04-13 11:21:00
3     C 2018-04-14 14:30:00 2018-04-14 10:32:00 2018-04-14 10:32:00
4     D 2018-04-15 03:10:00 2018-04-15 03:10:00 2018-04-15 03:10:00
5     E 2018-04-16 07:28:00                <NA> 2018-04-16 16:56:00
6     F 2018-04-17 11:17:00 2018-04-17 11:15:00 2018-04-17 11:15:00

【讨论】：