在 R 中转换为 Edgelist 时保留顶点名称答案

【问题标题】：Keeping Vertex Names when Converting to Edgelist in R（在 R 中转换为 Edgelist 时保留顶点名称）
【发布时间】：2020-02-04 14:41:34
【问题描述】：

我正在尝试将社会关系矩阵（sociomatrix，使用 R 中的 sna 包）转换为边列表（edgelist），同时保留顶点名称。不幸的是，我似乎走进了死胡同。

对于上下文，我使用的是 sna 包，因为我需要对数据进行对称化，这是在将其放入 edgelist 格式之前所做的。本质上，我需要获取一个网络矩阵，对其进行对称化，将其转换为边缘列表，然后将其写入 .csv。将其转换为边缘列表的步骤是我遇到问题的地方。这是我遇到的一个例子：

library(sna)

# Random graph on 6 vertices. `n` is the vertex count, so pass it as a
# number rather than as the string "6".
testmat <- rgraph(6)

colnames(testmat) <- c("153", "154", "155", "156", "157", "158")
rownames(testmat) <- c("153", "154", "155", "156", "157", "158")

# Symmetrize with the "weak" rule, then put the vertex labels back on the
# result.
maxsymmetrizedfile <- symmetrize(testmat, rule = "weak")
rownames(maxsymmetrizedfile) <- rownames(testmat)
colnames(maxsymmetrizedfile) <- colnames(testmat)

# Print the edge list once, then keep a copy for later use.
as.edgelist.sna(maxsymmetrizedfile)
maxsymm_edge <- as.edgelist.sna(maxsymmetrizedfile)

当我这样做时，这就是我得到的：

      snd rec val
 [1,]   2   1   1
 [2,]   3   1   1
 [3,]   4   1   1
 [4,]   5   1   1
 [5,]   6   1   1
 [6,]   1   2   1
 [7,]   3   2   1
 [8,]   5   2   1
 [9,]   1   3   1
[10,]   2   3   1
[11,]   4   3   1
[12,]   5   3   1
[13,]   6   3   1
[14,]   1   4   1
[15,]   3   4   1
[16,]   6   4   1
[17,]   1   5   1
[18,]   2   5   1
[19,]   3   5   1
[20,]   1   6   1
[21,]   3   6   1
[22,]   4   6   1

当我真正想要的是名称（“153”、“154”、“155”、“156”、“157”、“158”）而不是 1-6 时。

（运行代码时实际的矩阵/边列表会有所不同，因为图形是随机生成的，但应该做同样的事情）

您对我需要做些什么来完成这项工作有什么建议吗？我过去使用过 network 和 igraph 包，但是当我将 sociomatrix（社会关系矩阵）转换为图形/网络对象并尝试使用其他包转换为 edgelist 时，我得到了其他不同的错误（无法传递“n”参数）。

【问题讨论】：

标签： r networking sna edge-list

【解决方案1】：

这是您的数据：

# Labels shared by the rows and the columns of the adjacency matrix.
vertex_names <- as.character(153:158)

# One vertex per label; no seed is set, so the drawn graph differs per run.
testmat <- sna::rgraph(length(vertex_names))
rownames(testmat) <- vertex_names
colnames(testmat) <- vertex_names
testmat
#>     153 154 155 156 157 158
#> 153   0   0   0   1   1   0
#> 154   1   0   0   1   0   1
#> 155   1   1   0   0   0   1
#> 156   1   0   1   0   1   1
#> 157   1   0   1   1   0   0
#> 158   0   1   1   1   1   0

# Symmetrize under the "weak" rule, then copy the vertex labels from the
# original matrix onto the result before printing it.
maxsymmetrizedfile <- sna::symmetrize(testmat, rule = "weak")
rownames(maxsymmetrizedfile) <- rownames(testmat)
colnames(maxsymmetrizedfile) <- colnames(testmat)
maxsymmetrizedfile
#>     153 154 155 156 157 158
#> 153   0   1   1   1   1   0
#> 154   1   0   1   1   0   1
#> 155   1   1   0   1   1   1
#> 156   1   1   1   0   1   1
#> 157   1   0   1   1   0   1
#> 158   0   1   1   1   1   0

maxsymm_edge 有一个名为 "vnames" 的属性，您的示例中缺少该属性。

# Build the sna edge list and print it in one step (the outer parentheses
# force the assigned value to be printed).
(maxsymm_edge <- sna::as.edgelist.sna(maxsymmetrizedfile))
#>       snd rec val
#>  [1,]   2   1   1
#>  [2,]   3   1   1
#>  [3,]   4   1   1
#>  [4,]   5   1   1
#>  [5,]   1   2   1
#>  [6,]   3   2   1
#>  [7,]   4   2   1
#>  [8,]   6   2   1
#>  [9,]   1   3   1
#> [10,]   2   3   1
#> [11,]   4   3   1
#> [12,]   5   3   1
#> [13,]   6   3   1
#> [14,]   1   4   1
#> [15,]   2   4   1
#> [16,]   3   4   1
#> [17,]   5   4   1
#> [18,]   6   4   1
#> [19,]   1   5   1
#> [20,]   3   5   1
#> [21,]   4   5   1
#> [22,]   6   5   1
#> [23,]   2   6   1
#> [24,]   3   6   1
#> [25,]   4   6   1
#> [26,]   5   6   1
#> attr(,"n")
#> [1] 6
#> attr(,"vnames")
#> [1] "153" "154" "155" "156" "157" "158" # *********

我们可以用边列表中的顶点索引去索引它自带的 "vnames" 属性（内容与 vertex_names 相同）。

# Pull the vertex labels stored on the edge list, then print them.
vnames <- attr(maxsymm_edge, "vnames")
vnames
#> [1] "153" "154" "155" "156" "157" "158"

# Sender positions, one per edge.
snd_indices <- maxsymm_edge[, "snd"]
snd_indices
#>  [1] 2 3 4 5 1 3 4 6 1 2 4 5 6 1 2 3 5 6 1 3 4 6 2 3 4 5
# Translate each sender position into its vertex label.
vnames[snd_indices]
#>  [1] "154" "155" "156" "157" "153" "155" "156" "158" "153" "154" "156" "157"
#> [13] "158" "153" "154" "155" "157" "158" "153" "155" "156" "158" "154" "155"
#> [25] "156" "157"

# Receiver positions come from the "rec" column of the edge list (the "snd"
# column holds the senders).
(rec_indices <- maxsymm_edge[, "rec"])
#>  [1] 1 1 1 1 2 2 2 2 3 3 3 3 3 4 4 4 4 4 5 5 5 5 6 6 6 6
# Translate each receiver position into its vertex label.
vnames[rec_indices]
#>  [1] "153" "153" "153" "153" "154" "154" "154" "154" "155" "155" "155" "155"
#> [13] "155" "156" "156" "156" "156" "156" "157" "157" "157" "157" "158" "158"
#> [25] "158" "158"

所以我们可以像这样直接构建一个数据框：

# Build the labelled edge list: look up each sender/receiver position in the
# "vnames" attribute and keep the edge value alongside.
el_df <- local({
  labels <- attr(maxsymm_edge, "vnames")
  data.frame(
    snd = labels[maxsymm_edge[, "snd"]],
    rec = labels[maxsymm_edge[, "rec"]],
    val = maxsymm_edge[, "val"],
    stringsAsFactors = FALSE # already the default from R 4.0.0 onwards
  )
})
el_df
#>    snd rec val
#> 1  154 153   1
#> 2  155 153   1
#> 3  156 153   1
#> 4  157 153   1
#> 5  153 154   1
#> 6  155 154   1
#> 7  156 154   1
#> 8  158 154   1
#> 9  153 155   1
#> 10 154 155   1
#> 11 156 155   1
#> 12 157 155   1
#> 13 158 155   1
#> 14 153 156   1
#> 15 154 156   1
#> 16 155 156   1
#> 17 157 156   1
#> 18 158 156   1
#> 19 153 157   1
#> 20 155 157   1
#> 21 156 157   1
#> 22 158 157   1
#> 23 154 158   1
#> 24 155 158   1
#> 25 156 158   1
#> 26 157 158   1

为什么是数据框而不是矩阵？因为顶点名称和"val" 属于不同类型（character 与 double），所以尝试这样做（充其量）会将"val" 强制转换为一堆字符串。

# Inspect the column types: character vertex names next to numeric values.
str(el_df)
#> 'data.frame':    26 obs. of  3 variables:
#>  $ snd: chr  "154" "155" "156" "157" ...
#>  $ rec: chr  "153" "153" "153" "153" ...
#>  $ val: num  1 1 1 1 1 1 1 1 1 1 ...

但是，这仅在您要使用 "val" 时才重要。网络没有加权，因此您可以索引"vnames" 来构建矩阵边缘列表（或使用as.matrix(el_df[, 1:2]) 删除该列并从数据框转到矩阵）。

考虑到所有这些，我们可以更进一步，构建一个处理整个操作的函数：

# Convert an adjacency matrix into an edge-list data frame.
#
# Arguments:
#   adj_mat           A matrix. Every non-zero, non-NA cell [i, j] becomes one
#                     edge with sender i and receiver j.
#   use_vertex_names  If TRUE (the default) and `adj_mat` has row and/or
#                     column names, `snd` and `rec` hold those names instead
#                     of the positional indices.
#
# Returns:
#   A data frame with columns `snd`, `rec` and `val`, one row per edge, in
#   column-major order of the matrix cells. Zero rows if there are no edges.
#
# Errors:
#   Stops when vertex names are requested and both row and column names are
#   present but not identical.
as_edge_list_df <- function(adj_mat, use_vertex_names = TRUE) {
  # 3-column matrix: row index, column index, cell value.
  melted <- do.call(
    cbind,
    lapply(list(row(adj_mat), col(adj_mat), adj_mat), as.vector)
  )
  # which() skips NA cells instead of producing all-NA rows; drop = FALSE
  # keeps a matrix even when zero or one edge remains.
  filtered <- melted[which(melted[, 3L] != 0), , drop = FALSE]

  vertex_names <- NULL
  if (use_vertex_names) {
    row_names <- rownames(adj_mat)
    col_names <- colnames(adj_mat)
    # identical() is safe for NA names and for vectors of unequal length.
    if (!is.null(row_names) && !is.null(col_names) &&
          !identical(row_names, col_names)) {
      stop("row names do not match column names.")
    }
    # Fall back to the column names when only those are set.
    vertex_names <- if (is.null(row_names)) col_names else row_names
  }

  if (is.null(vertex_names)) {
    snd <- filtered[, 1L]
    rec <- filtered[, 2L]
  } else {
    snd <- vertex_names[filtered[, 1L]]
    rec <- vertex_names[filtered[, 2L]]
  }

  data.frame(
    snd = snd,
    rec = rec,
    val = filtered[, 3L],
    stringsAsFactors = FALSE # keep names as character on R < 4 as well
  )
}

那就去试驾吧……

# Run the helper on the symmetrized matrix and print the result in one step.
(el_df2 <- as_edge_list_df(maxsymmetrizedfile))
#>    snd rec val
#> 1  154 153   1
#> 2  155 153   1
#> 3  156 153   1
#> 4  157 153   1
#> 5  153 154   1
#> 6  155 154   1
#> 7  156 154   1
#> 8  158 154   1
#> 9  153 155   1
#> 10 154 155   1
#> 11 156 155   1
#> 12 157 155   1
#> 13 158 155   1
#> 14 153 156   1
#> 15 154 156   1
#> 16 155 156   1
#> 17 157 156   1
#> 18 158 156   1
#> 19 153 157   1
#> 20 155 157   1
#> 21 156 157   1
#> 22 158 157   1
#> 23 154 158   1
#> 24 155 158   1
#> 25 156 158   1
#> 26 157 158   1

...并验证它与我们在构建 el_df 时得到的结果完全相同。

# Abort with an error unless the hand-built and the function-built data
# frames are exactly identical.
stopifnot(identical(el_df, el_df2))

【讨论】：