【问题标题】:full_join adding extra rows and NA in r(full_join 在 r 中添加额外的行和 NA)
【发布时间】:2020-11-25 09:57:42
【问题描述】:

我尝试使用 full_join 连接两个数据框,这是我的数据的一个子集:

# df1: 56-row data.frame (dput output) with columns Team, Injury.Count and
# HomevsAway. Note the Team factor levels: most carry a trailing space
# (e.g. "Bucks "), a few do not ("76ers", "Bucks", "Hornets", "Pistons"),
# and the levels include "Bull " and "Net " alongside "Bulls " and "Nets ".
df1 <- structure(list(Team = structure(c(4L, 3L, 5L, 6L, 7L, 7L, 8L, 
8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 14L, 13L, 15L, 15L, 
16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 22L, 23L, 
23L, 24L, 24L, 25L, 25L, 28L, 28L, 29L, 29L, 30L, 30L, 31L, 31L, 
32L, 32L, 33L, 33L, 34L, 34L, 2L, 1L, 26L, 27L), .Label = c("76ers", 
"76ers ", "Bucks", "Bucks ", "Bull ", "Bulls ", "Cavaliers ", 
"Celtics ", "Clippers ", "Grizzlies ", "Hawks ", "Heat ", "Hornets", 
"Hornets ", "Jazz ", "Kings ", "Knicks ", "Lakers ", "Magic ", 
"Mavericks ", "Net ", "Nets ", "Nuggets ", "Pacers ", "Pelicans ", 
"Pistons", "Pistons ", "Raptors ", "Rockets ", "Spurs ", "Thunder ", 
"Timberwolves ", "Warriors ", "Wizards "), class = "factor"), 
    Injury.Count = c(3L, 3L, 1L, 1L, 1L, 2L, 0L, 2L, 1L, 1L, 
    0L, 2L, 1L, 0L, 5L, 4L, 3L, 2L, 3L, 0L, 3L, 3L, 4L, 6L, 5L, 
    0L, 2L, 2L, 1L, 2L, 0L, 1L, 3L, 4L, 2L, 6L, 2L, 1L, 1L, 1L, 
    3L, 3L, 4L, 5L, 1L, 6L, 4L, 2L, 0L, 2L, 2L, 1L, 5L, 6L, 1L, 
    1L), HomevsAway = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("0", "1"), class = "factor")), row.names = c(NA, 
-56L), class = "data.frame")

# df2: 56-row grouped tibble (class "grouped_df", grouped by Team; dput
# output) with columns Team, HomevsAway, t_1 and t_3. Unlike df1, none of
# the 28 Team factor levels here has trailing whitespace.
df2 <- structure(list(Team = structure(c(1L, 1L, 2L, 2L, 3L, 4L, 4L, 
5L, 6L, 7L, 8L, 9L, 9L, 10L, 10L, 11L, 12L, 12L, 13L, 13L, 14L, 
15L, 15L, 16L, 16L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 
22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 28L, 28L, 
3L, 5L, 6L, 7L, 8L, 11L, 14L, 17L, 27L), .Label = c("76ers", 
"Bucks", "Bulls", "Cavaliers", "Celtics", "Clippers", "Grizzlies", 
"Hawks", "Heat", "Hornets", "Jazz", "Kings", "Knicks", "Lakers", 
"Magic", "Mavericks", "Nets", "Nuggets", "Pacers", "Pelicans", 
"Pistons", "Raptors", "Rockets", "Spurs", "Thunder", "Timberwolves", 
"Warriors", "Wizards"), class = "factor"), HomevsAway = structure(c(1L, 
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 
1L, 1L, 2L, 2L, 2L, 1L, 1L), .Label = c("0", "1"), class = "factor"), 
    t_1 = c(55.883, 140.1, 32.2, 37.967, 29.85, 24.317, 57.316, 
    17.967, 19.05, 36.95, 16.167, 95.317, 86.533, 21.334, 52.567, 
    40.75, 28.3, 68.15, 97.067, 102.233, 26.866, 71.033, 34.467, 
    24.233, 42.033, 22.433, 59.033, 41.516, 12.7, 107.996, 6.5, 
    32.783, 0, 23.217, 13.93, 0, 54.88, 23.617, 83.834, 106.794, 
    17.56, 27.76, 85.83, 0.017, 35.183, 22.467, 25.033, 0, 0, 
    0, 0, 0, 0, 0, 0, 0), t_3 = c(197.3164, 388.6827, 126.2663, 
    111.916, 61.95, 91.55, 167.067, 104.083, 71.067, 135.383, 
    45.633, 261.317, 267.399, 114.6997, 159.2, 152.034, 84.8337, 
    204.3003, 351.449, 376.317, 86.333, 213.9, 99.767, 65.1, 
    131.767, 73.317, 126.416, 129.066, 73.383, 347.0994, 4761, 
    113.367, 0, 89.933, 59.8, 0, 188.983, 124.384, 215.666, 289.9667, 
    92, 144.2497, 254.083, 32.0333, 122.1837, 102.533, 82.817, 
    0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -56L), groups = structure(list(
    Team = structure(1:28, .Label = c("76ers", "Bucks", "Bulls", 
    "Cavaliers", "Celtics", "Clippers", "Grizzlies", "Hawks", 
    "Heat", "Hornets", "Jazz", "Kings", "Knicks", "Lakers", "Magic", 
    "Mavericks", "Nets", "Nuggets", "Pacers", "Pelicans", "Pistons", 
    "Raptors", "Rockets", "Spurs", "Thunder", "Timberwolves", 
    "Warriors", "Wizards"), class = "factor"), .rows = structure(list(
        1:2, 3:4, c(5L, 48L), 6:7, c(8L, 49L), c(9L, 50L), c(10L, 
        51L), c(11L, 52L), 12:13, 14:15, c(16L, 53L), 17:18, 
        19:20, c(21L, 54L), 22:23, 24:25, c(26L, 55L), 27:28, 
        29:30, 31:32, 33:34, 35:36, 37:38, 39:40, 41:42, 43:44, 
        c(45L, 56L), 46:47), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, 28L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

我已尝试使用 full_join 连接数据:

# Full join of df1 (left) and df2 (right), matching on HomevsAway and Team.
LR <- full_join(x = df1, y = df2, by = c("HomevsAway", "Team"))

得到的 LR 输出中多出了一些行,并且其中出现了看似随机的 NA。我预期的输出应该是一个 56 行 × 5 列的表。

【问题讨论】:

  • @Cettt 知道为什么我的实际数据会发生这种情况吗?
  • @Cettt 有没有可以将数据从 R 复制/粘贴到 stackoverflow 的功能?
  • dput。使用dput(mydata) 并将结果发布到问题中。
  • @Cettt 让我知道这是否有效,已在上面更新

标签: r dataframe join dplyr merge


【解决方案1】:

问题在于,在您的第一个 data.frame 中,球队名称带有一个尾随空格。这意味着字符串"Bucks" 不等于字符串"Bucks "，因此这两个值在连接时无法匹配。

以下是修复数据的方法。首先将 Team 转换为字符向量并删除其尾随空格(注意是尾随空格,而不是前导空格),然后 full_join 就能按预期工作:

# Normalise the join key before joining: convert the Team factor to character
# and strip surrounding whitespace so that "Bucks " in df1 matches "Bucks".
# trimws() replaces sub(" +", "", ...): that pattern is unanchored, so it
# removes the first run of spaces wherever it occurs, not the trailing one.
df1_new <- df1 %>%
  as_tibble() %>%
  mutate(Team = trimws(as.character(Team)))
# df2's Team levels carry no trailing space; only the type conversion is
# needed so both key columns are plain character vectors.
df2_new <- df2 %>%
  mutate(Team = as.character(Team))

# Names misspelled in df1 ("Bull", "Net") still fail to match after trimming.
df1_new %>% full_join(df2_new, by = c("Team", "HomevsAway"))

# A tibble: 58 x 5
   Team      Injury.Count HomevsAway   t_1   t_3
   <chr>            <int> <fct>      <dbl> <dbl>
 1 Bucks                3 0           32.2 126. 
 2 Bucks                3 1           38.0 112. 
 3 Bull                 1 0           NA    NA  
 4 Bulls                1 1            0     0  
 5 Cavaliers            1 0           24.3  91.6
 6 Cavaliers            2 1           57.3 167. 
 7 Celtics              0 0            0     0  
 8 Celtics              2 1           18.0 104. 
 9 Clippers             1 0            0     0  
10 Clippers             1 1           19.0  71.1

请注意,仍有一些 NA。这是由于一些拼写错误:Bull vs Bulls 和 Net vs Nets。

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 2015-06-24
    • 2021-07-12
    • 1970-01-01
    • 1970-01-01
    • 2016-08-09
    • 1970-01-01
    • 2021-06-24
    • 1970-01-01
    相关资源
    最近更新 更多