【问题标题】：Count occurrence of nodes in vertex of open triangles using igraph in R使用R中的igraph计算开放三角形顶点中节点的出现次数
【发布时间】：2021-06-28 23:39:09
【问题描述】：

在篮球运动员之间的传球网络中，我想：

检测网络中的开放三角形
计算处于中间位置的唯一玩家数量（A 传给 B 和 C；B 和 C 不互相传球；A 正在中间）
计算这些玩家促成开放三角形的次数

根据这个问题Extracting Open Triangles in R Igraph (Network Analysis)，我们可以执行以下操作：

library(igraph)

set.seed(1234)
G <- sample_gnm(10, 15)

G
IGRAPH 72f8e6a U--- 10 15 -- Erdos renyi (gnm) graph
+ attr: name (g/c), type (g/c), loops (g/l), m (g/n)
+ edges from 72f8e6a:
[1] 1-- 3 1-- 4 3-- 4 1-- 5 3-- 5 6-- 7 3-- 8 4-- 8 6-- 8 7-- 8 2-- 9 6-- 9 7-- 9 4--10 9--10
 
plot(G)

找到空心三角形：

openTriList <- unique(do.call(c, lapply(as_ids(V(G)), function(v) {
    do.call(c, lapply(as_ids(neighbors(G, v)), function(v1) {
    v2 <- as_ids(neighbors(G, v1))
    v2 <- v2[shortest.paths(G, v, v2) == 2]

    if(length(v2) != 0) {
        lapply(v2, function(vv2) { c(v, v1, vv2)[order(c(v, v1, vv2))] })
    } else { list() }
    }))
})))

结果正确：

do.call(rbind, openTriList)
  [,1] [,2] [,3]
  [1,]    1    3    8
  [2,]    1    4    8
  [3,]    1    4   10
  [4,]    2    6    9
  [5,]    2    7    9
  [6,]    2    9   10
  [7,]    3    4   10
  [8,]    3    6    8
  [9,]    3    7    8
  [10,]    1    4    5
  [11,]    3    4    5
  [12,]    4    6    8
  [13,]    4    7    8
  [14,]    4    9   10
  [15,]    3    5    8
  [16,]    6    9   10
  [17,]    7    9   10
  [18,]    4    8   10
  [19,]    6    8    9
  [20,]    7    8    9

我们如何找到作为经纪人的玩家？

玩家 2 在此列表中，因为它是一个开放三角形的一部分，但不是经纪人。我们忽略了这个玩家。

我们如何有效地计算这些玩家促成开放三角形的次数？

玩家 9 正在代理 5 个空心三角形。

[真实数据包含数百万次传球和数千名球员。所以性能是一个很重要的方面。使用combn 会导致计算时间非常长。有更快的方法吗？也许让邻接图构建一个稀疏矩阵并将其转换为data.table 对象以供邻居加入？看到这个link。 ]

【问题讨论】：

标签： r performance networking igraph

【解决方案1】：

更新

如果你想加快速度，下面是使用for loops + combn 定义用户函数f 的选项，它输出一个包含openTriList 和occurCnt 的列表（感谢@minem 的反馈以及性能改进）：

f <- function(G) {
  dmat <- as_adj(G, sparse = FALSE)
  resLst <- c()
  for (broker in 1:nrow(dmat)) {
    k <- which(dmat[broker, ] == 1)
    if (length(k) > 1) {
      inds <- t(combn(k, 2))
      resLst[[broker]] <- subset(cbind(broker, inds), dmat[inds] == 0)
    }
  }
  resLst <- do.call(rbind, resLst)
  resCnt <- table(resLst[, "broker"])
  list(openTriLst = resLst, occurCnt = resCnt)
}

你会看到他们可以达到预期的输出

> set.seed(1234)

> G <- sample_gnm(10, 15)

> f(G)
$openTriLst
      broker
 [1,]      1 4  5
 [2,]      3 1  8
 [3,]      3 4  5
 [4,]      3 5  8
 [5,]      4 1  8
 [6,]      4 1 10
 [7,]      4 3 10
 [8,]      4 8 10
 [9,]      6 8  9
[10,]      7 8  9
[11,]      8 3  6
[12,]      8 3  7
[13,]      8 4  6
[14,]      8 4  7
[15,]      9 2  6
[16,]      9 2  7
[17,]      9 2 10
[18,]      9 6 10
[19,]      9 7 10
[20,]     10 4  9

$occurCnt

 1  3  4  6  7  8  9 10
 1  3  4  1  1  4  5  1

而且速度比我之前的答案有了显着提高。您也可以将其与answer by @minem 进行比较。

> set.seed(1234)

> G1 <- sample_gnm(1000, 4000)

> system.time(f(G1))
   user  system elapsed 
   0.07    0.00    0.08

> G2 <- sample_gnm(10000, 40000)

> system.time(f(G2))
   user  system elapsed 
   2.46    0.16    2.62

上一个答案

您可以使用combn + are_ajdacent 尝试下面的代码，例如，

G <- sample_gnm(10, 15) %>%
  get.data.frame() %>%
  graph_from_data_frame(directed = FALSE)

openTriList <- do.call(
  rbind,
  sapply(
    names(V(G)),
    function(v) {
      nbs <- names(neighbors(G, v))
      if (length(nbs) > 1) {
        do.call(rbind, Filter(length, combn(nbs, 2, FUN = function(x) {
          if (!are_adjacent(G, x[1], x[2])) {
            sort(as.numeric(c(v, x)))
          }
        }, simplify = FALSE)))
      }
    }
  )
)

occurCount <- na.omit(
  sapply(names(V(G)), function(v) {
    nbs <- names(neighbors(G, v))
    ifelse(length(nbs) > 1,
      sum(!combn(nbs,
        2,
        FUN = function(x) are_adjacent(G, x[1], x[2])
      )),
      NA
    )
  })
)

你会得到命名向量

> openTriList
      [,1] [,2] [,3]
 [1,]    1    4    5
 [2,]    1    3    8
 [3,]    3    4    5
 [4,]    3    5    8
 [5,]    6    8    9
 [6,]    1    4    8
 [7,]    1    4   10
 [8,]    3    4   10
 [9,]    4    8   10
[10,]    7    8    9
[11,]    2    6    9
[12,]    6    9   10
[13,]    2    7    9
[14,]    7    9   10
[15,]    2    9   10
[16,]    3    6    8
[17,]    3    7    8
[18,]    4    6    8
[19,]    4    7    8
[20,]    4    9   10

> occurCount
 1  3  6  4  7  9  5  8 10 
 1  3  1  4  1  5  0  4  1
attr(,"na.action")
2
6
attr(,"class")
[1] "omit"

【讨论】：

这很好用，但是combn 函数会减慢它的速度。我需要做的计算估计需要 500 小时。有没有可能的解决方法？
@wake_wake 我猜计算复杂度是O(n^3) 给定n 网络中的顶点。在这种情况下，如果您有数百万个顶点，那么它不可避免地会很重并且会花费大量时间。
我明白了。我现在删除了已回答的标记并分配了赏金以征求有效的方法。如果没有，我恢复你的答案。谢谢你，托马斯！
@wake_wake 请查看我的更新，如果我们采用 for 循环 + combn 方法，它会提高性能。
@ThomasIsCoding 查看fun_openTriList 的更新代码。只需少量代码更改就应该更快。我想你在哪里增长对象：res <- rbind(res, do.call(rbind, r))

【解决方案2】：

我修改了 Thomas 的 openTriList 计算：

require(data.table)

v2 <- function(a) {
  n <- names(V(a))
  d <- as.data.table(a %>% get.data.frame())
  d <- as.data.table(lapply(d, as.numeric))
  k1 <- d[[1]]
  k2 <- d[[2]]
  ks <- k1 + k2
  # v <- n[[1]] # for testing
  xx <- sapply(n, function(v) {
    nbs <- as.numeric(names(neighbors(a, v)))
    vn <- as.numeric(v)
    
    if (length(nbs) > 1) {
      i2 <- combn(nbs, 2, simplify = F)
      
      # reduce test vectors:
      ss <- sapply(i2, sum)
      ii <- ks %in% ss
      kk1 <- k1[ii]
      kk2 <- k2[ii]
      
      # reduce test vectors 2:
      i <- (kk1 %in% nbs) & (kk2 %in% nbs)
      kk1 <- kk1[i]
      kk2 <- kk2[i]
      
      # x <- i2[[1]] # for testing
      i3 <- lapply(i2, function(x) {
        q1 <- kk1 == x[1]
        q2 <- kk2 == x[2]
        zz <- q1 & q2
        r2 <- !any(zz)
        if (is.null(r2) || r2) {
          rs <- c(vn, x)
          rs <- .Internal(sort(rs, decreasing = F)) # less overhead
          rs
        }
      })
      s <- i3[lengths(i3) > 0]
      do.call(rbind, s)
      }
    }
  )
  do.call(rbind, xx)
}

openTriList <- v2(G)

这应该会明显更快。 combn 并不慢。慢是are_adjacent。 Thomas 的代码写得很复杂，很难调试和发现可能的减速... 测试：

set.seed(1)
G <- sample_gnm(1000, 4000)
G <- get.data.frame(G) %>% graph_from_data_frame(directed = FALSE)
system.time(r1 <- v1(G)) # 12.00 sec
system.time(r2 <- v2(G)) # 0.99 sec
all.equal(r1, r2) # TRUE

更新

fun_openTriList2 <- function(G) {
  dmat <- as_adj(G, sparse = FALSE)
  res <- list()
  for (i in 1:nrow(dmat)) {
    inds <- which(dmat[i, ] == 1)
    if (length(inds) > 1) {
      r <- combn(inds, 2,
                 FUN = function(x) {
                   if (dmat[x[1], x[2]] == 0) {
                     .Internal(sort(c(i, x), decreasing = F))
                   }
                 },
                 simplify = F
      )
      res[[i]] <- do.call(rbind, r)
    }
  }
  res <- do.call(rbind, res)
  res
}

【讨论】：

很好地指出are_adjacent 是瓶颈。点赞你的答案！你可以看到我关于速度改进的更新。