这是我的解决方案。
它基于此处的算法 (https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541),但使用 data.table、shift 和向量化的 ifelse 语句来提高效率。它与原算法的另一个不同之处在于,我的代码可以对由 id_column 标识的多个数据集分别执行此操作。我的方法也没有跟踪各行(即“属性”),因为可以很容易地用 foverlaps 把这些区间合并回原始数据,所以没有必要这样做。foverlaps 同时也用于排除间隙。
请告诉我您是否发现效率低下
#' Split overlapping intervals into non-overlapping pieces
#'
#' Every start and end point of the input intervals is used as a cut point.
#' The resulting pieces are joined back to `x` with `foverlaps()`, so each
#' output row pairs one piece with one original row that covers it; pieces
#' that lie in a gap (covered by no row of `x`) do not appear.
#'
#' Intervals are treated as closed and integer-valued: the piece following an
#' end point starts at that value + 1, and the piece preceding a start point
#' ends at that value - 1.
#'
#' @param x A data.table holding the intervals.
#' @param start_column Name of the column with the interval starts.
#' @param end_column Name of the column with the interval ends.
#' @param id_column Optional character vector of grouping column names; the
#'   split is carried out separately within each group.
#' @return The result of `foverlaps(x, pieces)`: the piece boundaries are in
#'   `start_column` / `end_column`, the original boundaries in the
#'   `i.`-prefixed columns, followed by the remaining columns of `x`.
remove_overlaps <- function(x, start_column, end_column, id_column = NULL) {
  key_cols <- c(id_column, start_column, end_column)

  # One row per interval boundary. `measure.vars` is spelled out so melt()
  # does not have to guess (and warn) when `id_column` is NULL.
  pts <- melt(
    x[, key_cols, with = FALSE],
    id.vars = id_column,
    measure.vars = c(start_column, end_column)
  )

  # FALSE marks a start point, TRUE an end point.
  pts[, end := variable == end_column]

  # At equal values FALSE sorts before TRUE, i.e. starts come before ends.
  setorderv(pts, c(id_column, "value", "end"))
  pts[
    ,
    c("end_next", "value_next") := shift(list(end, value), type = "lead"),
    by = id_column
  ]

  # The last point of each group has no successor (NA lead) and therefore
  # opens no piece.
  segs <- pts[!is.na(end_next)]

  # Adding/subtracting the logical flags is the vectorised form of
  #   start = value      if the point is a start, value + 1      otherwise
  #   end   = value_next if the next point is an end, value_next - 1 otherwise
  # and, unlike ifelse(), keeps the type and attributes of `value`.
  segs[, seg_start := value + end]
  segs[, seg_end := value_next - !end_next]

  # Duplicate or adjacent cut points yield empty pieces; drop them.
  seg_cols <- c(id_column, "seg_start", "seg_end")
  temp <- segs[seg_end >= seg_start, seg_cols, with = FALSE]
  setnames(temp, c("seg_start", "seg_end"), c(start_column, end_column))
  setkeyv(temp, key_cols)

  # `by.x` is explicit: the default would use key(x) whenever `x` is keyed,
  # which need not be the interval columns.
  out <- foverlaps(x, temp, by.x = key_cols)
  setorderv(
    out,
    c(
      id_column,
      start_column,
      paste0("i.", start_column),
      paste0("i.", end_column)
    )
  )
  out
}
# Example call. NOTE(review): assumes `x` is a data.table that has the
# columns start1, end1 and id1 -- confirm against the data actually in use.
remove_overlaps(
  x,
  start_column = "start1",
  end_column = "end1",
  id_column = "id1"
)
另外,我认为该页面上那条关于如何排除间隙的评论建议并不正确。
此答案未考虑间隙(间隙不应出现在输出中),所以我对其进行了细化:
* 如果 e=false,则在 S 中添加 a;如果 e=true,则从 S 中移除 a
* 定义 n'=n(如果 e=false)或 n'=n+1(如果 e=true)
* 定义 m'=m-1(如果 f=false)或 m'=m(如果 f=true)
* 如果 n'<=m' 且 (e 且非 f) 为假,则输出 (n',m',S);否则不输出任何内容
下面是该算法在 R 中实现的第二个版本:remove_overlaps 没有显式采用 silentman.it 提出的排除间隙的建议,而 remove_overlaps1 采用了该建议。请注意,这两个函数随后都会调用 foverlaps 来排除间隙:只有当区间与 x(原始数据)中的区间部分匹配时,才会被返回。
library(data.table)
#' Split overlapping intervals, pre-filtering "end -> start" pieces
#'
#' Variant of the interval-splitting routine that additionally applies the
#' refinement suggested in a comment on the linked answer: a piece is only
#' kept when `(e & !f)` is FALSE, where `e` says the current cut point is an
#' end and `f` says the next cut point is an end.
#'
#' NOTE: this rule discards more than gaps. A piece between the end of one
#' interval and the start of a later one is dropped even when another interval
#' is still open across it. With intervals 14761-14776, 14775-14790 and
#' 14789-14804, the piece 14777-14788 is dropped although 14775-14790 covers
#' it. The rule is kept here on purpose so the function reproduces that
#' suggestion faithfully.
#'
#' Intervals are treated as closed and integer-valued (neighbouring pieces
#' differ by 1 at their boundary).
#'
#' @param x A data.table holding the intervals.
#' @param start_column Name of the column with the interval starts.
#' @param end_column Name of the column with the interval ends.
#' @param id_column Optional character vector of grouping column names; the
#'   split is carried out separately within each group.
#' @return The result of `foverlaps(x, pieces)`: the piece boundaries are in
#'   `start_column` / `end_column`, the original boundaries in the
#'   `i.`-prefixed columns, followed by the remaining columns of `x`.
remove_overlaps1 <- function(x, start_column, end_column, id_column = NULL) {
  key_cols <- c(id_column, start_column, end_column)

  # One row per interval boundary. `measure.vars` is spelled out so melt()
  # does not have to guess (and warn) when `id_column` is NULL.
  pts <- melt(
    x[, key_cols, with = FALSE],
    id.vars = id_column,
    measure.vars = c(start_column, end_column)
  )

  # FALSE marks a start point, TRUE an end point.
  pts[, end := variable == end_column]

  # At equal values FALSE sorts before TRUE, i.e. starts come before ends.
  setorderv(pts, c(id_column, "value", "end"))
  pts[
    ,
    c("end_next", "value_next") := shift(list(end, value), type = "lead"),
    by = id_column
  ]

  # Keep rows that have a successor and for which (e & !f) is FALSE, as per
  # the comment suggestion on the linked answer.
  segs <- pts[!is.na(end_next) & !(end & !end_next)]

  # Adding/subtracting the logical flags is the vectorised form of
  #   start = value      if the point is a start, value + 1      otherwise
  #   end   = value_next if the next point is an end, value_next - 1 otherwise
  # and, unlike ifelse(), keeps the type and attributes of `value`.
  segs[, seg_start := value + end]
  segs[, seg_end := value_next - !end_next]

  # Duplicate or adjacent cut points yield empty pieces; drop them.
  seg_cols <- c(id_column, "seg_start", "seg_end")
  temp <- segs[seg_end >= seg_start, seg_cols, with = FALSE]
  setnames(temp, c("seg_start", "seg_end"), c(start_column, end_column))
  setkeyv(temp, key_cols)

  # foverlaps() returns a piece only where it overlaps a row of `x`, so any
  # remaining piece that lies in a gap is excluded here. `by.x` is explicit
  # because the default would use key(x) whenever `x` is keyed.
  out <- foverlaps(x, temp, by.x = key_cols)
  setorderv(
    out,
    c(
      id_column,
      start_column,
      paste0("i.", start_column),
      paste0("i.", end_column)
    )
  )
  out
}
示例数据:
library(data.table)
# Example input: five intervals for a single id; each interval shares its last
# two values with the first two values of the next one.
x <- data.frame(
  native_id = rep("1", 5L),
  n_start_date = c(14761, 14775, 14789, 14803, 14817),
  n_end_date = c(14776, 14790, 14804, 14818, 14832),
  obs = c(
    31.668140525481,
    34.8623263656539,
    35.0841466093899,
    37.2281249364127,
    36.3726151694052
  ),
  stringsAsFactors = FALSE
)
# Convert to a data.table by reference.
setDT(x)
> x
native_id n_start_date n_end_date obs
1: 1 14761 14776 31.66814
2: 1 14775 14790 34.86233
3: 1 14789 14804 35.08415
4: 1 14803 14818 37.22812
5: 1 14817 14832 36.37262
结果:
> remove_overlaps(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
native_id n_start_date n_end_date i.n_start_date i.n_end_date obs
1: 1 14761 14774 14761 14776 31.66814
2: 1 14775 14776 14761 14776 31.66814
3: 1 14775 14776 14775 14790 34.86233
4: 1 14777 14788 14775 14790 34.86233
5: 1 14789 14790 14775 14790 34.86233
6: 1 14789 14790 14789 14804 35.08415
7: 1 14791 14802 14789 14804 35.08415
8: 1 14803 14804 14789 14804 35.08415
9: 1 14803 14804 14803 14818 37.22812
10: 1 14805 14816 14803 14818 37.22812
11: 1 14817 14818 14803 14818 37.22812
12: 1 14817 14818 14817 14832 36.37262
13: 1 14819 14832 14817 14832 36.37262
看似不正确,排除了太多区间:
> remove_overlaps1(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
native_id n_start_date n_end_date i.n_start_date i.n_end_date obs
1: 1 14761 14774 14761 14776 31.66814
2: 1 14775 14776 14761 14776 31.66814
3: 1 14775 14776 14775 14790 34.86233
4: 1 14789 14790 14775 14790 34.86233
5: 1 14789 14790 14789 14804 35.08415
6: 1 14803 14804 14789 14804 35.08415
7: 1 14803 14804 14803 14818 37.22812
8: 1 14817 14818 14803 14818 37.22812
9: 1 14817 14818 14817 14832 36.37262
10: 1 14819 14832 14817 14832 36.37262