下面是针对您的问题的一个解决方案。该方案首先将 df1 和 df2 的 SURNAME 列拆分为两个单独的姓氏,以便对每个姓氏分别进行匹配(参见 df1_bis 和 df2_bis)。然后,它遍历 df2 的每一条记录,检查 df1 中是否存在这样的条目:其 NAME 与该记录不完全相同,并且至少有一个姓氏与该记录相同。如果这两个条件都满足,再检查这些条目的 CITY 和 STATE 在 df1 和 df2 中是否一致。如果一致,则将 familyDummy 赋值为 1,否则赋值为 0。
library(tidyverse)
# Sample data ----
# df1: reference people, each with a full NAME, a location (CITY, STATE)
# and the SURNAME part of the name.
df1 <- data.frame(
  NAME = c(
    "Maria Antonia Sousa",
    "Josep Oliveira Carlos",
    "Jose Mario Augusto Farias",
    "Andre Gois Lucas"
  ),
  CITY = c("A", "A", "B", "B"),
  STATE = c("X", "X", "Y", "Y"),
  SURNAME = c(
    "Antonia Sousa",
    "Oliveira Carlos",
    "Augusto Farias",
    "Gois Lucas"
  ),
  stringsAsFactors = FALSE
)
# df2: same layout as df1 plus a familyDummy column of 0/1 integers.
df2 <- data.frame(
  NAME = c(
    "Maria Antonia Sousa",
    "Angela Oliveira Santos",
    "Fabio Silva Carlos",
    "Luan Gois Lucas"
  ),
  CITY = c("A", "A", "B", "B"),
  STATE = c("X", "X", "Y", "Y"),
  SURNAME = c(
    "Antonia Sousa",
    "Oliveira Santos",
    "Silva Carlos",
    "Gois Lucas"
  ),
  familyDummy = c(0L, 1L, 0L, 1L),
  stringsAsFactors = FALSE
)
# Divide surnames ----
# Add surname1 / surname2 columns holding two words extracted from SURNAME,
# so that each surname can be compared on its own.
#
# df: a data frame with a character SURNAME column.
# Returns df with two extra character columns:
#   surname1 - first run of letters that is followed by whitespace
#   surname2 - first run of letters that is preceded by whitespace
# Either column is NA when SURNAME has no such run (e.g. a single word).
split_surnames <- function(df) {
  df %>%
    mutate(
      # \\p{L} matches any Unicode letter. The former class [A-z] also
      # matched the ASCII symbols between "Z" and "a" ([ \ ] ^ _ `) and
      # missed accented letters, cutting names such as "Gonçalves" short.
      surname1 = str_extract(SURNAME, "\\p{L}+(?=\\s)"),
      surname2 = str_extract(SURNAME, "(?<=\\s)\\p{L}+")
    )
}
df1_bis <- split_surnames(df1)
df2_bis <- split_surnames(df2)
# Flag every df2 entry that has a relative in df1. A relative is a df1 row
# with a different NAME, at least one surname in common, and the same CITY
# and STATE. The result is stored in familyDummy (1L = relative found,
# 0L = none) and the resulting data frame is printed.
df2 %>%
  # map_int() yields a plain integer column (map() would create a list
  # column); seq_len() also handles a zero-row df2_bis, where 1:nrow()
  # would iterate over c(1, 0).
  mutate(familyDummy = map_int(seq_len(nrow(df2_bis)), function(i) {
    # Surnames of the current df2 entry. NAs are dropped so that %in%
    # below cannot pair a missing surname with another missing surname.
    own_surnames <- c(df2_bis$surname1[i], df2_bis$surname2[i])
    own_surnames <- own_surnames[!is.na(own_surnames)]
    # TRUE for df1 rows whose NAME differs from the current entry. Whole
    # strings are compared: using df1 names as regex patterns would match
    # substrings and break on regex metacharacters.
    is_other_person <- df1_bis$NAME != df2_bis$NAME[i]
    # TRUE for df1 rows where either surname equals one of the entry's
    # surnames (exact match, so "Sa" no longer matches "Santos").
    shares_surname <- df1_bis$surname1 %in% own_surnames |
      df1_bis$surname2 %in% own_surnames
    # CITY and STATE must agree on the same df1 row; testing each with a
    # separate %in% could take the city from one row and the state from
    # another.
    same_place <- df1_bis$CITY == df2_bis$CITY[i] &
      df1_bis$STATE == df2_bis$STATE[i]
    # na.rm = TRUE: a comparison involving NA counts as "no match"
    as.integer(any(is_other_person & shares_surname & same_place,
                   na.rm = TRUE))
  }))
#                     NAME CITY STATE         SURNAME familyDummy
# 1    Maria Antonia Sousa    A     X   Antonia Sousa           0
# 2 Angela Oliveira Santos    A     X Oliveira Santos           1
# 3     Fabio Silva Carlos    B     Y    Silva Carlos           0
# 4        Luan Gois Lucas    B     Y      Gois Lucas           1