【问题标题】:making a new ID when duplicates are removed in R using dplyr(使用 dplyr 在 R 中删除重复项时创建新 ID)
【发布时间】:2022-01-17 14:58:17
【问题描述】:

这是我的数据集的一个子集:

> dput(df)
structure(list(ID = c(238L, 238L, 238L, 238L, 238L, 238L, 238L, 
238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 
238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 
238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 
238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L, 
238L, 238L, 238L, 238L, 238L, 238L, 238L, 238L), X = c(54.6775637888, 
54.9684018941, 54.9684018941, 55.2592399993, 55.2592399993, 55.8409162098, 
55.8409162098, 56.1317543151, 55.5500781046, 55.5500781046, 55.2592399993, 
59.6218115782, 56.1317543151, 56.4225924204, 56.4225924204, 56.4225924204, 
59.6218115782, 56.7134305256, 56.4225924204, 59.6218115782, 59.6218115782, 
56.7134305256, 59.6218115782, 57.5859448414, 57.8767829466, 59.6218115782, 
59.3309734729, 59.6218115782, 59.6218115782, 59.6218115782, 59.6218115782, 
59.6218115782, 59.6218115782, 60.2034877887, 59.6218115782, 59.6218115782, 
59.6218115782, 59.6218115782, 59.6218115782, 59.6218115782, 59.6218115782, 
59.9126496835, 59.6218115782, 59.6218115782, 59.6218115782, 59.6218115782, 
59.6218115782, 59.6218115782, 59.6218115782, 59.6218115782, 59.9126496835, 
59.6218115782, 59.6218115782, 59.6218115782, 59.6218115782, 59.6218115782, 
59.6218115782, 59.6218115782, 59.6218115782), Y = c(177.411244208, 
179.447110945, 180.319625261, 180.901301471, 181.773815787, 182.355491998, 
182.937168208, 182.937168208, 183.809682524, 184.973034945, 184.391358735, 
170.721967787, 185.26387305, 185.845549261, 185.845549261, 186.427225471, 
170.721967787, 186.718063577, 186.718063577, 171.012805893, 171.012805893, 
188.463092208, 171.012805893, 189.335606524, 189.626444629, 171.012805893, 
190.78979705, 170.721967787, 191.662311366, 170.721967787, 192.825663787, 
170.721967787, 170.721967787, 193.698178103, 170.721967787, 170.721967787, 
170.721967787, 170.721967787, 170.721967787, 170.721967787, 170.721967787, 
170.721967787, 170.721967787, 170.721967787, 170.721967787, 170.721967787, 
170.721967787, 170.721967787, 170.721967787, 170.721967787, 170.721967787, 
170.721967787, 170.721967787, 170.721967787, 170.721967787, 170.721967787, 
170.721967787, 170.721967787, 170.721967787), T = c(553, 554, 
555, 556, 557, 558, 559, 560, 561, 562, 562, 563, 563, 564, 565, 
566, 567, 567, 568, 568, 569, 569, 570, 570, 571, 571, 572, 572, 
573, 573, 574, 574, 575, 575, 576, 577, 578, 579, 580, 581, 582, 
583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 
596, 597, 598, 599, 600), compID = c("Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238", 
"Day8-Series004-238", "Day8-Series004-238", "Day8-Series004-238"
)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-59L), groups = structure(list(compID = "Day8-Series004-238", 
    .rows = structure(list(1:59), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE))

我想做的是:

  1. 删除所有重复出现的 T(时间)。-> 已成功完成
  2. 当连续 T 之间的差值大于 1 时,创建一个新的 compID。-> 成功完成
  3. 这个新的 compID 应该向下填充到其后连续的 T,直到遇到下一个由 #2 生成的 compID 为止。-> 这就是我需要帮助的地方!

这是我目前得到的结果:

    ID        X        Y   T             compID  T2               compID2
1  238 54.67756 177.4112 553 Day8-Series004-238  NA    Day8-Series004-238
2  238 54.96840 179.4471 554 Day8-Series004-238 553    Day8-Series004-238
3  238 54.96840 180.3196 555 Day8-Series004-238 554    Day8-Series004-238
4  238 55.25924 180.9013 556 Day8-Series004-238 555    Day8-Series004-238
5  238 55.25924 181.7738 557 Day8-Series004-238 556    Day8-Series004-238
6  238 55.84092 182.3555 558 Day8-Series004-238 557    Day8-Series004-238
7  238 55.84092 182.9372 559 Day8-Series004-238 558    Day8-Series004-238
8  238 56.13175 182.9372 560 Day8-Series004-238 559    Day8-Series004-238
9  238 55.55008 183.8097 561 Day8-Series004-238 560    Day8-Series004-238
10 238 56.42259 185.8455 564 Day8-Series004-238 561 Day8-Series004-238.10
11 238 56.42259 185.8455 565 Day8-Series004-238 564    Day8-Series004-238
12 238 56.42259 186.4272 566 Day8-Series004-238 565    Day8-Series004-238
13 238 59.62181 170.7220 576 Day8-Series004-238 566 Day8-Series004-238.13
14 238 59.62181 170.7220 577 Day8-Series004-238 576    Day8-Series004-238
15 238 59.62181 170.7220 578 Day8-Series004-238 577    Day8-Series004-238
16 238 59.62181 170.7220 579 Day8-Series004-238 578    Day8-Series004-238
17 238 59.62181 170.7220 580 Day8-Series004-238 579    Day8-Series004-238

使用此代码:

# Asker's attempt: drop every time point that occurs more than once, then
# give each row that follows a gap in T a suffixed ID.
df2 <- df %>%
  arrange(T) %>%
  # A row survives only if its T is not repeated anywhere, scanning from
  # either end
  filter(!duplicated(T) & !duplicated(T, fromLast = TRUE)) %>%
  mutate(
    # T of the previous retained row (NA on the first row)
    T2 = lag(T),
    # Rows directly following their predecessor keep compID; rows after a
    # gap get "<compID>.<row number>". The first row evaluates to NA.
    compID2 = ifelse(T - T2 != 1, paste(compID, 1:n(), sep = "."), compID)
  ) %>%
  # Only NA entries are filled, and the first row is the only one here, so
  # this step does not spread the new IDs over the rows that follow a gap
  fill(compID2, .direction = "up")

结果应该是这样的:

    ID        X        Y   T             compID  T2               compID2
1  238 54.67756 177.4112 553 Day8-Series004-238  NA    Day8-Series004-238
2  238 54.96840 179.4471 554 Day8-Series004-238 553    Day8-Series004-238
3  238 54.96840 180.3196 555 Day8-Series004-238 554    Day8-Series004-238
4  238 55.25924 180.9013 556 Day8-Series004-238 555    Day8-Series004-238
5  238 55.25924 181.7738 557 Day8-Series004-238 556    Day8-Series004-238
6  238 55.84092 182.3555 558 Day8-Series004-238 557    Day8-Series004-238
7  238 55.84092 182.9372 559 Day8-Series004-238 558    Day8-Series004-238
8  238 56.13175 182.9372 560 Day8-Series004-238 559    Day8-Series004-238
9  238 55.55008 183.8097 561 Day8-Series004-238 560    Day8-Series004-238
10 238 56.42259 185.8455 564 Day8-Series004-238 561 Day8-Series004-238.10
11 238 56.42259 185.8455 565 Day8-Series004-238 564    Day8-Series004-238.10
12 238 56.42259 186.4272 566 Day8-Series004-238 565    Day8-Series004-238.10
13 238 59.62181 170.7220 576 Day8-Series004-238 566 Day8-Series004-238.13
14 238 59.62181 170.7220 577 Day8-Series004-238 576    Day8-Series004-238.13
15 238 59.62181 170.7220 578 Day8-Series004-238 577    Day8-Series004-238.13
16 238 59.62181 170.7220 579 Day8-Series004-238 578    Day8-Series004-238.13
17 238 59.62181 170.7220 580 Day8-Series004-238 579    Day8-Series004-238.13

任何帮助将不胜感激!如果你也觉得1和2有更高效的方法,也请推荐!

谢谢!

【问题讨论】:

  • 嗨@Kaye11,你解决这个问题了吗?

标签: r dplyr data-wrangling


【解决方案1】:

您已经完成了 90% 的工作。这可能不是最可靠的解决方案,但它适用于示例数据:

library(dplyr)
library(tidyr)

# `T` is R's built-in shorthand for TRUE, which makes it a confusing column
# name, so rename it to `Time`. Matching on the name rather than the position
# keeps this correct if the column order differs and makes it safe to re-run.
names(df)
#> [1] "ID"     "X"      "Y"      "T"      "compID"
names(df)[names(df) == "T"] <- "Time"

# `df` is grouped by compID, so the duplicate detection, lag(), row_number()
# and fill() below all operate within each compID separately.
df %>%
  # Sort by Time so that lag() really returns the preceding time point;
  # without this the gaps are only right if the input happens to be sorted
  arrange(Time) %>%
  # Get Times that only appear once
  # Note this is different from getting unique Times!
  filter(!(duplicated(Time) | duplicated(Time, fromLast = TRUE))) %>%
  mutate(
    # Difference between successive retained Times (NA on the first row)
    diff = Time - lag(Time),
    # Previous retained Time
    Time2 = lag(Time),
    # Start of a run: the first row keeps the original compID, a row after a
    # gap gets "<compID>.<row number>". Rows inside a run are left NA so that
    # fill() can carry the run's ID down to them.
    compID2 = case_when(
      is.na(diff) ~ compID,
      diff > 1 ~ paste(compID, row_number(), sep = "."),
      TRUE ~ NA_character_
    )
  ) %>%
  fill(compID2, .direction = "down")

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 2018-04-07
    • 2021-09-24
    • 1970-01-01
    • 1970-01-01
    • 2023-03-09
    • 2021-05-03
    • 2014-05-22
    • 1970-01-01
    相关资源
    最近更新 更多