数据
# First example table: X1 holds date-like strings (the join key),
# X2 holds the integer values 4, 5, 6.
df1 <- data.frame(
  X1 = c("01.01.2000", "01.01.2001", "01.01.2002"),
  X2 = 4:6,
  stringsAsFactors = FALSE
)
# Second example table: shares the key "01.01.2002" with df1 and adds two
# later keys; X2 holds the integer values 8, 9, 10.
df2 <- data.frame(
  X1 = c("01.01.2002", "01.01.2003", "01.01.2004"),
  X2 = 8:10,
  stringsAsFactors = FALSE
)
代码
library(dplyr)
# Merge df1 and df2 on X1, keeping a single X2 value per key.
# full_join() keeps every X1 found in either table and exposes the two X2
# columns as X2.x (from df1) and X2.y (from df2).
# coalesce(X2.y, X2.x) takes X2.y wherever it is not NA and falls back to
# X2.x otherwise, so df2 wins when a key occurs in both tables. This is the
# same rule the three-branch case_when() spelled out by hand.
full_join(df1, df2, by = "X1") %>%
  mutate(X2 = coalesce(X2.y, X2.x)) %>%
  select(X1, X2)
说明
- 首先,对两个数据集执行 full_join,并以 X1 作为连接列。这会生成 X2.x 和 X2.y 两列,分别保存来自 df1 和 df2 的 X2 值。
- 然后只需应用 mutate,按照您给出的规则从 X2.x 和 X2.y 中选出正确的值即可。
基准测试
不过,distinct 解决方案在所有情形下都比 join 解决方案快(约 3 倍),如以下基准测试所示:
library(tidyverse)
library(microbenchmark)
# Build a pair of test tables for the benchmark.
#
# n:                number of rows in each table.
# percent_matching: share of keys (0..1) that table B keeps in common with A.
#
# Returns a list with two data frames, A and B, each with a character key
# column X1 and a constant column X2 ("a" for A, "b" for B). A randomly
# chosen round(n * (1 - percent_matching)) keys of B get the suffix "_b",
# so they no longer match the corresponding key in A.
make_data_frame <- function(n, percent_matching = .1) {
  base_ids <- paste0("ID_", seq.int(n))
  n_changed <- round(n * (1 - percent_matching), 0)
  changed <- sample(n, n_changed)
  other_ids <- base_ids
  other_ids[changed] <- paste(base_ids[changed], "b", sep = "_")
  list(
    A = data.frame(X1 = base_ids, X2 = "a", stringsAsFactors = FALSE),
    B = data.frame(X1 = other_ids, X2 = "b", stringsAsFactors = FALSE)
  )
}
# Benchmark candidate 1: stack B on top of A, then keep only the first row
# seen for each X1, so a key present in both tables keeps its row from B.
.distinct <- function(dfs) {
  stacked <- bind_rows(dfs$B, dfs$A)
  distinct(stacked, X1, .keep_all = TRUE)
}
# Benchmark candidate 2: full join of A and B on X1, then pick one X2 per key.
# X2.x comes from A and X2.y from B; B's value is used whenever it is present,
# A's value otherwise. The helper columns X2.x and X2.y are left in the result.
.join <- function(dfs) {
  joined <- full_join(dfs$A, dfs$B, by = "X1")
  mutate(
    joined,
    X2 = case_when(
      !is.na(X2.x) & !is.na(X2.y) ~ X2.y,
      is.na(X2.y) ~ X2.x,
      is.na(X2.x) ~ X2.y
    )
  )
}
# Benchmark grid: every combination of table size and share of matching keys.
scenarios <- expand.grid(
  n = c(1e4, 1e5, 1e6),
  percent_matching = c(.1, .5, .9)
)
# One pair of test tables per scenario row (columns are matched to the
# arguments of make_data_frame() by name).
all_data <- pmap(scenarios, make_data_frame)
# Time both candidates on each pair of tables. The argument is named `.x` so
# the expressions are recorded as ".distinct(.x)" and ".join(.x)".
all_mb <- map(
  all_data,
  function(.x) microbenchmark(.distinct(.x), .join(.x))
)
# Build one table with the mean run time of each expression in each scenario.
# For every scenario row, the scenario parameters are bound column-wise to the
# microbenchmark summary (one row per timed expression; the single scenario
# row is recycled by cbind()).
# `unit = "ms"` pins every summary to milliseconds. Without it, summary()
# chooses a display unit separately for each benchmark object, so the `mean`
# column could mix seconds and milliseconds across scenarios.
map_dfr(seq_len(nrow(scenarios)), function(i) {
  mdat <- scenarios[i, ]
  my_summary <- summary(all_mb[[i]], unit = "ms")
  # Drop row names so cbind() does not carry over or warn about them.
  rownames(mdat) <- NULL
  rownames(my_summary) <- NULL
  cbind(mdat, my_summary)
}) %>%
  select(n, percent_matching, expr, mean)
# n percent_matching expr mean
# 1 1e+04 0.1 .distinct(.x) 4.975013
# 2 1e+04 0.1 .join(.x) 12.587072
# 3 1e+05 0.1 .distinct(.x) 59.577142
# 4 1e+05 0.1 .join(.x) 149.987451
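# 5 1e+06 0.1 .distinct(.x) 1.158597
# 6 1e+06 0.1 .join(.x) 2.699003
# NOTE: rows 5-6 were recorded in seconds (about 1158.6 ms and 2699.0 ms);
# every other row in this table is in milliseconds.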
# 7 1e+04 0.5 .distinct(.x) 4.485196
# 8 1e+04 0.5 .join(.x) 11.902656
# 9 1e+05 0.5 .distinct(.x) 46.660016
# 10 1e+05 0.5 .join(.x) 132.180758
# 11 1e+06 0.5 .distinct(.x) 913.503111
# 12 1e+06 0.5 .join(.x) 2148.531600
# 13 1e+04 0.9 .distinct(.x) 4.299905
# 14 1e+04 0.9 .join(.x) 12.731292
# 15 1e+05 0.9 .distinct(.x) 37.558069
# 16 1e+05 0.9 .join(.x) 111.428117
# 17 1e+06 0.9 .distinct(.x) 458.030035
# 18 1e+06 0.9 .join(.x) 1458.408847