【发布时间】:2014-12-29 16:31:11
【问题描述】:
我需要从 R igraph 中导出 graphml 文件,以便手动添加列值。之后再次导入该 graphml 文件时,它必须是正确的 UTF-8 编码,并且是有效的 XML。因此,我在保存之前使用 iconv() 将数据转换为 UTF-8,如下面代码中的 for 循环所示:
library(igraph)
# Load the edge list. read.csv2() expects semicolon-separated fields, and
# quote = "" disables quote handling so quote characters stay in the data.
# NOTE(review): the URL is a Dropbox share link to a .graphml file -- confirm
# it actually serves the semicolon-separated edge table this script expects.
edges <- read.csv2(
  "https://www.dropbox.com/s/p8e7hcck0d4nnrp/Subgraph_nowvalid.graphml?dl=0",
  header = TRUE,
  quote = ""
)
amount <- nrow(edges)
amount

# Pre-allocate the source-side vertex table with one (empty) row per edge.
sources <- data.frame(
  Vertexname = character(amount),
  Description = character(amount),
  Follower = numeric(amount),
  Friends = numeric(amount),
  Favourites = numeric(amount),
  Statuses = numeric(amount),
  ProfileAge = numeric(amount),
  Listed = numeric(amount),
  Timestamp = numeric(amount),
  OutDegree = numeric(amount),
  InDegree = numeric(amount),
  WOutDegree = numeric(amount),
  WInDegree = numeric(amount)
)
# The target-side table starts out as an identical empty frame.
targets <- sources
# Normalise the text columns of `edges` to UTF-8 and strip control characters
# so the data can later be written out as well-formed XML.
#
# Only text columns are touched: iconv() coerces its input with
# as.character(), so running it on every column would silently turn numeric
# columns into strings. seq_len() is used instead of 1:ncol(edges) so the loop
# body is skipped for a zero-column frame rather than iterating over c(1, 0).
#
# NOTE(review): no `from` encoding is passed, so iconv() assumes the input is
# in the session's native encoding -- confirm that matches the real file.
for (i in seq_len(ncol(edges))) {
  column <- edges[[i]]
  if (is.factor(column)) {
    # Text may have been read in as a factor; work on its labels.
    column <- as.character(column)
  }
  if (is.character(column)) {
    # sub = "" drops bytes that cannot be converted instead of yielding NA.
    column <- iconv(column, to = "UTF-8", sub = "")
    # Remove control characters, most of which are not legal in XML text.
    edges[[i]] <- gsub("[[:cntrl:]]", "", column)
  }
}
# Source vertices: the name comes from edge column 1 and the timestamp from
# edge column 4; every other attribute is left as NA.
sources[, 1] <- edges[, 1]
sources[, 9] <- edges[, 4]
sources[, c(2:8, 10:13)] <- NA

# Target vertices: the name comes from edge column 2, and attribute columns
# 2..8 are copied from edge columns 7..13 (a fixed offset of 5). The
# remaining columns (9..13) are left as NA.
targets[, 1] <- edges[, 2]
for (k in 2:8) {
  targets[, k] <- edges[, k + 5]
}
targets[, 9:13] <- NA
print("REPORT: vertices data frames filled")

# Drop fully identical rows from both tables.
sources <- sources[!duplicated(sources), , drop = FALSE]
targets <- targets[!duplicated(targets), , drop = FALSE]
print("REPORT: Duplicated sources and targets removed")
# Full outer join of the source and target vertex tables on Vertexname.
# merge() suffixes clashing columns with ".x" (from `sources`) and ".y" (from
# `targets`). Each pair is collapsed into a single column by taking the value
# from one side and falling back to the other when it is NA. The suffixed
# helper columns are removed by assigning NULL inside within().
nodes <- within(merge(sources, targets, by = "Vertexname", all = TRUE), {
  # as.character() (instead of paste()) keeps a missing description as NA
  # rather than turning it into the literal string "NA".
  Description <- ifelse(
    is.na(Description.x),
    as.character(Description.y),
    as.character(Description.x)
  )
  Description.x <- NULL
  Description.y <- NULL

  Follower <- ifelse(is.na(Follower.x), Follower.y, Follower.x)
  Follower.x <- NULL
  Follower.y <- NULL

  Friends <- ifelse(is.na(Friends.x), Friends.y, Friends.x)
  Friends.x <- NULL
  Friends.y <- NULL

  Favourites <- ifelse(is.na(Favourites.x), Favourites.y, Favourites.x)
  Favourites.x <- NULL
  Favourites.y <- NULL

  Statuses <- ifelse(is.na(Statuses.x), Statuses.y, Statuses.x)
  Statuses.x <- NULL
  Statuses.y <- NULL

  ProfileAge <- ifelse(is.na(ProfileAge.x), ProfileAge.y, ProfileAge.x)
  ProfileAge.x <- NULL
  ProfileAge.y <- NULL

  Listed <- ifelse(is.na(Listed.x), Listed.y, Listed.x)
  Listed.x <- NULL
  Listed.y <- NULL

  # Unlike the other attributes, the timestamp prefers the target-side value
  # and falls back to the source-side one.
  Timestamp <- ifelse(is.na(Timestamp.y), Timestamp.x, Timestamp.y)
  Timestamp.x <- NULL
  Timestamp.y <- NULL

  OutDegree <- ifelse(is.na(OutDegree.x), OutDegree.y, OutDegree.x)
  OutDegree.x <- NULL
  OutDegree.y <- NULL

  InDegree <- ifelse(is.na(InDegree.x), InDegree.y, InDegree.x)
  InDegree.x <- NULL
  InDegree.y <- NULL

  WOutDegree <- ifelse(is.na(WOutDegree.x), WOutDegree.y, WOutDegree.x)
  WOutDegree.x <- NULL
  WOutDegree.y <- NULL

  WInDegree <- ifelse(is.na(WInDegree.x), WInDegree.y, WInDegree.x)
  WInDegree.x <- NULL
  WInDegree.y <- NULL
})
print("REPORT: Sources and Targets merged")
# Keep only the first row for each vertex name.
is_repeat <- duplicated(nodes$Vertexname)
nodes <- nodes[!is_repeat, , drop = FALSE]
print("REPORT: Duplicated vertices removed")
nrow(nodes)

# Discard edges lacking either endpoint and vertices lacking a name.
has_both_ends <- !is.na(edges[[1]]) & !is.na(edges[[2]])
edges <- edges[has_both_ends, ]
has_name <- !is.na(nodes[[1]])
nodes <- nodes[has_name, ]
print("REPORT: Invalid edges and nodes removed")

# Build the directed graph from the edge table, with `nodes` as vertex data.
g <- graph.data.frame(d = edges, directed = TRUE, vertices = nodes)
print("REPORT: Graph created")
# Compute plain degree and strength for every vertex, in both directions.
all_vertices <- V(g)
outdegrees <- degree(g, v = all_vertices, mode = "out")
indegrees <- degree(g, v = all_vertices, mode = "in")
woutdegrees <- graph.strength(g, v = all_vertices, mode = "out")
windegrees <- graph.strength(g, v = all_vertices, mode = "in")

# Attach each measure to the graph as a vertex attribute of the same name.
degree_measures <- list(
  OutDegree = outdegrees,
  InDegree = indegrees,
  WOutDegree = woutdegrees,
  WInDegree = windegrees
)
for (attr_name in names(degree_measures)) {
  g <- set.vertex.attribute(g, attr_name, V(g), degree_measures[[attr_name]])
}
print("REPORT: Degree calculated and added as vertex attribute")
# Filter
# Derive the degree thresholds: the out-degree and in-degree of the vertex
# found at a fixed rank when sorting by that measure in decreasing order.
nodes <- get.data.frame(g, "vertices")
nodes <- nodes[order(nodes$OutDegree, decreasing = TRUE), ]
nrow(nodes)

# Rank of the vertex whose degree becomes the cut-off. It is clamped to the
# number of vertices so that a graph with fewer vertices still yields a real
# threshold instead of NA from an out-of-range row lookup (NA thresholds would
# make every later comparison NA).
cutoff_rank <- min(1335L, nrow(nodes))

minOutDegree <- nodes[cutoff_rank, "OutDegree"]
minOutDegree
nodes <- nodes[order(nodes$InDegree, decreasing = TRUE), ]
minInDegree <- nodes[cutoff_rank, "InDegree"]
minInDegree

# Vertices reaching at least one threshold (nodes2) or both (nodes3); only
# their counts are reported.
nodes2 <- subset(
  nodes,
  nodes$OutDegree >= minOutDegree | nodes$InDegree >= minInDegree
)
nrow(nodes2)
nodes3 <- subset(
  nodes,
  nodes$OutDegree >= minOutDegree & nodes$InDegree >= minInDegree
)
nrow(nodes3)
# Add an empty "Group" vertex attribute (all NA).
g <- set.vertex.attribute(g, "Group", V(g), NA)

# Successively narrow the graph. Every filter reads the OutDegree / InDegree
# vertex attributes stored on the graph; they are not recomputed between
# steps, so each step uses the values attached before filtering started.
# Step 1: keep vertices that reach at least one of the degree thresholds.
g <- induced.subgraph(
  g,
  V(g)$OutDegree >= minOutDegree | V(g)$InDegree >= minInDegree
)
length(E(g))
length(V(g))

# Step 2: keep vertices that have both outgoing and incoming edges.
g <- induced.subgraph(g, V(g)$OutDegree > 0 & V(g)$InDegree > 0)
length(E(g))
length(V(g))

# Step 3: keep vertices whose out-degree exceeds a third of their in-degree.
g <- induced.subgraph(g, V(g)$OutDegree > (V(g)$InDegree / 3))
length(E(g))
length(V(g))

# write.graph() defaults to the plain "edgelist" format, so the GraphML
# format has to be requested explicitly to get an XML file with attributes.
write.graph(g, "SomePath", format = "graphml")
print("REPORT: Subgraph Test saved")
当我再次使用 read.graph 导入 graphml 文件时,我得到了错误:
Error in .Call("R_igraph_read_graph_graphml", file, as.numeric(index), :
At foreign-graphml.c:1202 :
å
, Parse error
因此我使用 XMLValidatorBuddy 来验证 graphml 文件(在下拉字段中选择了 UTF-8 作为使用的编码,但无论选择哪种编码都会发生错误)。这是我得到错误的地方:
无效字节'?'在 2 字节序列的第 2 位
根据 XMLValidatorBuddy,错误发生在第 4278 行。
这个问题的答案对我没有帮助,因为经过 R 中的转换之后,我得到的应该已经是 UTF-8 编码的 graphml 文件。
【问题讨论】:
-
你能做一个能重现问题的最小(minimal)示例吗?你贴出的这段代码依赖于一个我们没有的文件,而且做了上百件事,其中任何一件都可能是问题所在。唯一的链接指向一个 6 MB 的 XML 文件,我懒得下载。你能构造一个小图,并生成一个同样损坏的 graphml 文件吗?这应该只需要几行代码,最多十几行。
-
我试图缩小图的范围,只保留与可能导致问题的节点相关的边,因为第 4278 行包含的正是该节点的数据。但这样做之后错误并没有出现。在不知道错误来源的情况下,我不知道该如何重现它……
-
你删除了链接文件吗?
-
抱歉,我在更改文件时忘记更新超链接。第一行中“graphml 文件”后面的链接现在指向与第二行代码中的 read.graph 相同的文件。问题在此期间得到了解决
标签: xml r encoding utf-8 igraph