【问题标题】：Find overlapping segments in multiple columns在多列中查找重叠段
【发布时间】：2015-08-28 12:52:08
【问题描述】：

我有以下数据集，并希望在单独的列中获取重叠标签的数量 (n.overlaps)、重叠标签的名称 (overlap.labels) 以及重叠的持续时间 (overlap.duration ）。

这是我的数据集：

label   begin   end
======================
lower   9.03    12.41
lower   28.773  29.975
lower   33.895  35.992
lower   46.814  48.854
lower   58.51   61.51
lower   62.971  63.491
upper   28.132  30.432
upper   46.716  50.82
upper   58.536  61.482
upper   29.975  33.895
upper   53.376  54.08
upper   10.358  11.958
upper   30.532  46.716
upper   51.633  58.536
head    9.918   14.818
head    29.823  30.623
head    58.802  61.404
head    61.404  63.562

我想要的表是这个：

lower.begin    lower.end    upper.begin     upper.end   head.begin  head.end    n.overlaps  overlap.labels       overlap.duration
9.03           12.41         10.358         11.958        9.918      14.418         3        lower|upper|head         1.6
28.773         29.975        28.132         30.432        29.823     30.623         3        lower|upper|head         0.152
33.895         35.992        30.532         46.716         -            -           2        lower|upper              2.097
...

这是表格的可视化表示：

数据

structure(list(label = c("lower", "lower", "lower", "lower", 
"lower", "lower", "upper", "upper", "upper", "upper", "upper", 
"upper", "upper", "upper", "head", "head", "head", "head"
), begin = c(9.03, 28.773, 33.895, 46.814, 58.51, 62.971, 28.132, 
46.716, 58.536, 29.975, 53.376, 10.358, 30.532, 51.633, 9.918, 
29.823, 58.802, 61.404), end = c(12.41, 29.975, 35.992, 48.854, 
61.51, 63.491, 30.432, 50.82, 61.482, 33.895, 54.08, 11.958, 
46.716, 58.536, 14.818, 30.623, 61.404, 63.562)), .Names = c("label", 
"begin", "end"), class = "data.frame", row.names = c(NA, -18L))

【问题讨论】：

标签： r overlap

【解决方案1】：

这确实是一个评论，但它带有一张图片。

您想要的输出非常不清楚。具体来说，您的示例数据似乎有三组重叠，青色、浅绿色和梅子色：

一旦我们同意这是三个重叠区域，您甚至不清楚您想要什么。

绘图代码

library(data.table); setDT(x)
cols<-c(lower="black",upper="blue",middle="red")
ys<-c(lower=1.8,upper=2.2,middle=2)
par(mar=c(2.1,4.1,4.1,1.1))
x[,{plot(1,type="n",xlim=range(onset,offset),
        ylim=c(1.7,2.3),yaxt="n",ylab="",xlab="",
        main="Depiction of Intervals")
  axis(side=2,at=ys[unique(label)],
       labels=unique(label),las=1)}]
rect(x[order(onset)][1,onset],1.7,
     x[order(offset)][3,offset],2.3,col="cyan")
rect(x[order(onset)][4,onset],1.7,
     x[order(offset)][11,offset],2.3,col="lightgreen")
rect(x[order(onset)][12,onset],1.7,
     x[order(offset)][18,offset],2.3,col="plum")
for (lbs in x[,unique(label)]){
  x[label==lbs,
    arrows(onset,ys[label],offset,ys[label],lwd=3,
           code=3,angle=90,length=.07,col=cols[label])]
}

【讨论】：

我添加了正确的表格图形表示。了解表格可能会很有用，我想拥有。

【解决方案2】：

这是一个开始。当我有更多时间时，我将添加最后三列。它看起来很复杂，但我使用lubridate 将持续时间转换为时间间隔。有一个名为new_interval 的函数可以创建它们，还有一个名为int_overlaps 的函数可以测试重叠。

更新

代码已完成。检查它是否有帮助。

library(lubridate)

starts <- as.POSIXct(df$begin, origin=Sys.time())
ends <- as.POSIXct(df$end, origin=Sys.time())

spans <- new_interval(starts, ends)
s <- split(spans, df$label)
d <- split(df, df$label)

overlap <- function(x1, x2) {

  out <- sapply(1:length(s[[x1]]), function(x) {
    which(int_overlaps(s[[x1]][x], s[[x2]]))}
    )

    mat_lst <- lapply(out, function(x) {
      matrix(c(d[[x2]]$begin[x],d[[x2]]$end[x]),ncol=2)}
      )

    mat_lst[lengths(mat_lst) == 0L] <- list(matrix(NA, ncol=2))
    mat_lst

}

lh <- overlap("lower", "head")
lu <- overlap("lower", "upper")
matches <- suppressWarnings(lapply(1:nrow(d$lower), function(x) {
  cbind(d$lower[x,2:3], lu[[x]], lh[[x]])}
))
new_df <- `names<-`(do.call(rbind, matches), c("lower.begin", "lower.end", "upper.begin", "upper.end", "head.begin", "head.end"))
rownames(new_df) <- NULL

#n.overlaps
count <- colSums(apply(new_df, 1, function(x) !is.na(x)))/2
new_df$n.overlaps <- ave(count, new_df$lower.begin, FUN=function(x) x+length(x)-1)

#overlap.labels
new_df$overlap.labels <- apply(new_df[1:6], 1, function(x) 
  paste(unique(gsub("\\..*", "", names(which(!is.na(x))))), collapse="|"))


#overlap.duration
first <- pmin(new_df$lower.end, new_df$upper.end)-new_df$upper.begin
second <- pmin(new_df$lower.end, new_df$head.end)-new_df$head.begin
overlap <- ifelse(is.na(first+second), ifelse(is.na(first), second, first), first+second)
new_df$overlap.duration <- ave(overlap, new_df$lower.begin, FUN=sum)
new_df
#   lower.begin lower.end upper.begin upper.end head.begin head.end n.overlaps
# 1       9.030    12.410      10.358    11.958      9.918   14.818          3
# 2      28.773    29.975      28.132    30.432     29.823   30.623          4
# 3      28.773    29.975      29.975    33.895     29.823   30.623          4
# 4      33.895    35.992      29.975    33.895         NA       NA          3
# 5      33.895    35.992      30.532    46.716         NA       NA          3
# 6      46.814    48.854      46.716    50.820         NA       NA          2
# 7      58.510    61.510      58.536    61.482     58.802   61.404          4
# 8      58.510    61.510      51.633    58.536     61.404   63.562          4
# 9      62.971    63.491          NA        NA     61.404   63.562          2
#     overlap.labels overlap.duration
# 1 lower|upper|head            4.092
# 2 lower|upper|head            2.147
# 3 lower|upper|head            2.147
# 4      lower|upper            9.380
# 5      lower|upper            9.380
# 6      lower|upper            2.138
# 7 lower|upper|head           12.557
# 8 lower|upper|head           12.557
# 9       lower|head            2.087

更新 #2

我修饰了matches 函数。它应该为更多种类做好准备。在旧脚本中替换它。

matches <- suppressWarnings(lapply(1:nrow(d$lower), function(x) {
  max.len <- max(length(c(length(lu[[x]]), length(lh[[x]]))))
  xu <- lu[[x]]
  xh <- lh[[x]]
  dim(xu) <- dim(xh) <- NULL
  length(xu) <- length(xh) <- max.len
  umat <- matrix(xu, byrow=T, ncol=2)
  hmat <- matrix(xh, byrow=T, ncol=2)
  cbind(d$lower[x,2:3], umat, hmat)}
))

数据

df <- structure(list(label = c("lower", "lower", "lower", "lower", 
"lower", "lower", "upper", "upper", "upper", "upper", "upper", 
"upper", "upper", "upper", "head", "head", "head", "head"
), begin = c(9.03, 28.773, 33.895, 46.814, 58.51, 62.971, 28.132, 
46.716, 58.536, 29.975, 53.376, 10.358, 30.532, 51.633, 9.918, 
29.823, 58.802, 61.404), end = c(12.41, 29.975, 35.992, 48.854, 
61.51, 63.491, 30.432, 50.82, 61.482, 33.895, 54.08, 11.958, 
46.716, 58.536, 14.818, 30.623, 61.404, 63.562)), .Names = c("label", 
"begin", "end"), class = "data.frame", row.names = c(NA, -18L))

【讨论】：

看起来很复杂，但这是一个很好的开始。这就是我一直在寻找的。希望您能抽出时间来完成缺少的三列。已经谢谢你的这部分了。
嘿皮埃尔！你有没有机会在接下来的几天内抽出时间来完成代码？那会很酷。
您有旧版本的 R。安装 3.2.2。它有许多新功能和错误修复。
如果这不起作用。将mat_lst[lengths(mat_lst) == 0L] 替换为mat_lst[unlist(lapply(mat_lst, length)) == 0L]
mat_lst[lengths(mat_lst) == 0L] <- list(matrix(NA, ncol=2)) 中有错字，应该是length。我无法运行matches <- suppressWarnings(lapply(1:nrow(d$lower), function(x) { cbind(d$lower[x,2:3], lu[[x]], lh[[x]])} ))。我收到一条错误消息：Error in data.frame(..., check.names = FALSE) : The arguments have a different numer of raws: 1, 2, 3... 知道如何解决吗？

【解决方案3】：

这是使用来自data.table 的foverlaps 的尝试：

subset_dat <- function(x, .label) {
  ans = x[label == .label]
  setnames(ans, paste(.label, names(ans), sep="_"))
}
setkey(setDT(dat), begin, end))
olaps1 = foverlaps(subset_dat(dat, "head"), subset_dat(dat, "lower"), type="any")
olaps2 = foverlaps(subset_dat(dat, "upper"), subset_dat(dat, "lower"), type="any")
ans  = merge(olaps1, olaps2, by=names(olaps1)[1:3], all=TRUE)

ans[, olap.labels := paste(lower_label, head_label, upper_label, sep="|")]
ans[, olap.labels := gsub("\\|NA|NA\\|", "", olap.labels)]
ans[, c("lower_label", "head_label", "upper_label") := NULL]
ans[, olap.count := sapply(gregexpr("\\|", olap.labels), function(x) sum(x != -1L)+1L)]
ans[, olap.interval := abs(pmax(lower_begin, head_begin, upper_begin, na.rm=TRUE) - 
                           pmin(lower_end, head_end, upper_end, na.rm=TRUE))]

#     lower_begin lower_end head_begin head_end upper_begin upper_end      olap.labels olap.count olap.interval
#  1:          NA        NA         NA       NA      53.376    54.080            upper          1         0.704
#  2:       9.030    12.410      9.918   14.818      10.358    11.958 lower|head|upper          3         1.600
#  3:      28.773    29.975     29.823   30.623      28.132    30.432 lower|head|upper          3         0.152
#  4:      28.773    29.975     29.823   30.623      29.975    33.895 lower|head|upper          3         0.000
#  5:      33.895    35.992         NA       NA      29.975    33.895      lower|upper          2         0.000
#  6:      33.895    35.992         NA       NA      30.532    46.716      lower|upper          2         2.097
#  7:      46.814    48.854         NA       NA      46.716    50.820      lower|upper          2         2.040
#  8:      58.510    61.510     58.802   61.404      51.633    58.536 lower|head|upper          3         0.266
#  9:      58.510    61.510     58.802   61.404      58.536    61.482 lower|head|upper          3         2.602
# 10:      58.510    61.510     61.404   63.562      51.633    58.536 lower|head|upper          3         2.868
# 11:      58.510    61.510     61.404   63.562      58.536    61.482 lower|head|upper          3         0.078
# 12:      62.971    63.491     61.404   63.562          NA        NA       lower|head          2         0.520

【讨论】：

我试图在提供的数据集上复制您的代码，但运行 setDT(dat, key=c("begin", "end")) 失败。我收到错误消息，指出 key 参数未使用。我提前安装了包data.table和overlaps。我不知道我在哪里失败了。
@user5264244,setDT()可以在当前开发版本中设置keys。通过调用setkey() 修复。现在应该可以工作了。