如何在 R 中获取 XML 中的子节点数量 - 答案

【问题标题】：How to get the number of children in XML in R（如何在 R 中获取 XML 中的子节点数量）
【发布时间】：2018-06-13 01:12:52
【问题描述】：

我是 R 新手。现在我想解析一个 XML 文件 (https://da5020.weebly.com/uploads/8/6/5/9/8659576/pubmedsample.jun18.xml)，除了每篇文章的作者数量之外，其余部分都进展顺利。我采用了“Efficiently get the number of children with specific name using XML and R”中的一段代码：

# The path is absolute (leading "//"), so count() is evaluated once against
# the whole document and yields a single total, not one count per article.
authors_number <- xpathSApply(
  xmldata,
  "count(//PubmedArticle/MedlineCitation/Article/AuthorList/Author/LastName)",
  xmlValue
)

但它返回的是整个 XML 中的作者总数。其余的解析由以下代码完成：

library(tidyverse)
# make_date() lives in lubridate, which library(tidyverse) only attaches
# since tidyverse 2.0.0; attach it explicitly so the script runs everywhere.
library(lubridate)
library(XML)
library(methods)

xmldata <- xmlParse("pubmedsample.jun18.xml", useInternalNodes = TRUE)

# One <MedlineCitation> node per article. Every column below is extracted
# relative to these nodes, so all columns have exactly one entry per article
# and rows cannot get out of alignment.
citations <- getNodeSet(xmldata, "//PubmedArticle/MedlineCitation")

# Text of the first node matching `path` (relative to `node`), or NA when
# the node has no such descendant.
first_value <- function(node, path) {
  values <- xpathSApply(node, path, xmlValue)
  if (length(values) == 0) NA_character_ else values[[1]]
}

# Apply first_value() to every citation; always returns a character vector
# with one element per article.
extract_chr <- function(path) {
  vapply(citations, first_value, character(1), path = path)
}

# Same as extract_chr(), converted to numeric (NA stays NA).
extract_num <- function(path) {
  as.numeric(extract_chr(path))
}

publication <- tibble(
  PMID = extract_num("./PMID"),
  ISSN = extract_chr("./Article/Journal/ISSN"), # parse ISSN
  data_completed_year = extract_num("./DateCompleted/Year"),
  data_completed_month = extract_num("./DateCompleted/Month"),
  data_completed_day = extract_num("./DateCompleted/Day"),
  data_revised_year = extract_num("./DateRevised/Year"),
  data_revised_month = extract_num("./DateRevised/Month"),
  data_revised_day = extract_num("./DateRevised/Day"),
  # Parse the first type of each article, if more than one. The previous
  # `[1]` on the document-wide vector recycled the first article's value
  # onto every row.
  publication_type = extract_chr(
    "./Article/PublicationTypeList/PublicationType[1]"
  ),
  article_title = extract_chr("./Article/ArticleTitle")
) %>%
  mutate(
    completed_date = as.character(
      make_date(data_completed_year, data_completed_month, data_completed_day)
    ),
    revised_date = as.character(
      make_date(data_revised_year, data_revised_month, data_revised_day)
    )
  ) %>%
  select(PMID, ISSN, completed_date, revised_date, publication_type, article_title)

有人可以教我如何获取每篇文章的作者数量吗？非常感谢！

【问题讨论】：

标签： r xml

【解决方案1】：

为了让你开始，这就是我要做的：

# Convert the parsed document to a nested list and keep the MedlineCitation
# element of each top-level entry. `[[` is used instead of `$` because `$`
# partial-matches names on lists.
lst <- lapply(xmlToList(xmldata), function(x) x[["MedlineCitation"]])

# Extract the AuthorList element (NULL when an article has none)
lst.author <- lapply(lst, function(x) x[["Article"]][["AuthorList"]])

# Count the number of authors. vapply() guarantees an integer vector even for
# empty input, where sapply() would return a list.
n.author <- vapply(
  lst.author,
  function(x) sum(names(x) == "Author"),
  integer(1)
)
#PubmedArticle PubmedArticle PubmedArticle PubmedArticle PubmedArticle
#            1             0            10             1             6
#PubmedArticle
#            2

我发现使用 list 比使用 XMLInternalDocument 更容易，因此先做了转换。这样任务就归结为遍历嵌套的 list 并提取相关信息。

【讨论】：

非常感谢！我可以弄清楚剩下的部分。
不客气@Vincent；祝你工作顺利。

【解决方案2】：

对于使用xml2 和purrr 的人：

library(xml2)
library(purrr)

doc <- read_xml("https://da5020.weebly.com/uploads/8/6/5/9/8659576/pubmedsample.jun18.xml")

# Evaluate count() relative to each <Article> node, giving one number per
# article. xml_find_num() is xml2's function for XPath expressions that
# return a number; xml_find_first() is meant for expressions returning nodes.
xml_find_all(doc, ".//MedlineCitation/Article") %>%
  map_dbl(xml_find_num, xpath = "count(.//AuthorList/Author/LastName)")
##   [1]   1   0  10   1   6   2   4   2   5   4  11   3   4   8   2   3
##  [17]   7  25   5   3   3   0   1   2   2   6   2   4   1   1   4   4
##  [33]   8   2  25   7   6   5   2   3   8   2   6   7   9   2   3  10
##  [49] 256   6   9   5   9   0   4   3   2   9   2   4   4   2   2   1
##  [65]   1   2   1   1   1   1   1   1   4   5   4   1   0   2   1   5
##  [81]   2   2   1   1  11   1   4   1   2   4   2   3   1   1   1   1
##  [97]   0   0   1   1   1   1  10   1 620   4   5   5   1   7   4   3
## [113]   1   2   4   3   9   1   1   3   2   1   3   1   6   4   5   3
## [129]   2   2   5   9   2   2   1  23   3   2   1  14  41  12  12   1
## [145]   4   3   0   2   3   2   7   0   1   1   9   1   2   2   2  18
## [161]   4   2   7   5   1   9   5  14   0   0  10  20   0   0   0   0
## [177]   0

【讨论】：

【解决方案3】：

XPath 解决方案是统计每个 PubmedArticle 中 AuthorList 节点的 xmlChildren 数量：

# plyr must be attached before the tidyverse: attaching it afterwards masks
# dplyr verbs such as mutate() and summarise().
library(plyr)
library(XML)
library(tidyverse)

doc <- xmlParse("pubmedsample.jun18.xml")

articles <- getNodeSet(doc, "//PubmedArticle")

# Build one small tibble per article; ldply() row-binds them.
# NOTE(review): an article lacking an ISSN or DateCompleted/Year yields a
# zero-length column here and so presumably contributes no row -- confirm
# whether dropping such articles is intended.
output <- ldply(articles, function(x) {
  ISSN <- xpathSApply(x, ".//Article/Journal/ISSN", xmlValue)
  data_completed_year <- xpathSApply(x, ".//DateCompleted/Year", xmlValue)

  # Number of children of <AuthorList>
  author_count <- length(xpathSApply(x, ".//AuthorList", xmlChildren))
  # The condition is scalar, so use if/else rather than ifelse();
  # a count of zero is reported as NA.
  authors <- if (author_count > 0) author_count else NA_integer_

  # ... #

  # tibble() replaces the deprecated data_frame()
  tibble(ISSN, data_completed_year, authors)
}) %>%
  as_tibble() # replaces the deprecated tbl_df()

head(output)

输出：

# A tibble: 161 x 3
  ISSN      data_completed_year authors
  <chr>     <chr>                 <int>
1 0095-3814 1976                      1
2 0377-8231 1993                     NA
3 0022-2623 1991                     10
4 0021-8820 1987                      6
5 0014-2956 1994                      2
6 1051-0443 1993                      4
7 0017-0011 1996                      2
8 0026-895X 1996                      5
9 1059-2725 1996                      4
10 0009-7322 1997                     11
# ... with 151 more rows

【讨论】：