我创建了一个 XML 文件，我认为它与您可能使用的文件相似。下面先解析这个文件，再使用函数 `get_xml_dat` 提取数据。
# Locate the XML input file(s). `pattern` is a regular expression, so the dot
# is escaped and the match is anchored to the end of the file name; otherwise
# names such as "myxml.txt" or "data.xml.bak" would be picked up as well.
file <- list.files("~/R/", pattern = "\\.xml$", full.names = TRUE)
#' Extract selected child values of a repeated XML node into a tibble
#'
#' Each input is parsed with `xml2::read_html()`; element names are matched in
#' lower case, which is why `main_node` and `values` are lower-cased before
#' use. Values are read from the parsed tree rather than from the serialised
#' markup, so the result does not depend on how the file is laid out on
#' lines, and character entities are returned decoded.
#'
#' @param xml_file Path to the XML file. A character vector of several paths
#'   is also accepted; the rows of all files are stacked in order.
#' @param main_node Name (CSS selector) of the element that holds one record,
#'   e.g. `"Kontaktdaten"`.
#' @param values Character vector with the names of the descendant elements
#'   to extract from every `main_node`.
#' @return A tibble with one row per matched `main_node` and one character
#'   column per entry of `values` (lower-cased, in the order given). A value
#'   that is absent from a record is `NA`; if an element occurs several times
#'   inside one record, its first occurrence is used.
get_xml_dat <- function(xml_file, main_node, values) {
  # Column names follow the lower-cased tag names; drop repeated requests so
  # the resulting tibble never gets duplicate column names.
  columns <- unique(tolower(values))
  record_selector <- tolower(main_node)

  # A character vector is treated as one source per element; anything else
  # (e.g. a connection) is handed to read_html() unchanged.
  sources <- if (is.character(xml_file)) as.list(xml_file) else list(xml_file)

  # Read one source and return its records as a tibble.
  read_one <- function(src) {
    records <- rvest::html_nodes(xml2::read_html(src), record_selector)
    cells <- lapply(columns, function(tag) {
      # xml_find_first() yields exactly one (possibly missing) node per
      # record, and xml_text() turns a missing node into NA, which keeps
      # every column aligned with the records.
      xml2::xml_text(xml2::xml_find_first(records, paste0(".//", tag)))
    })
    names(cells) <- columns
    tibble::as_tibble(cells)
  }

  dplyr::bind_rows(lapply(sources, read_one))
}
输出
> get_xml_dat(
+ xml_file = file, # XML file you want to get data from
+ main_node = 'Kontaktdaten', # node where the data is located
+ values = c('Name', 'ID', 'Hausnummer', 'Postleitzahl', 'Ort', 'Strasse') # values you want to get from the main_node
+ )
# A tibble: 3 x 6
name id hausnummer postleitzahl ort strasse
<chr> <chr> <chr> <chr> <chr> <chr>
1 " Name_ABC " 912283 1 12345 ABC ABC-Strasse
2 " Name_DEF " 123456 NA 12345 DEF DEF-Strasse
3 " Name_XYZ " 123456 3 12345 XYZ XYZ-Strasse
数据
加载到 R 中的 XML 文件。
<Qualitaetsbericht>
<Krankenhaus>
<Kontaktdaten>
<Name> Name_ABC </Name>
<ID>912283</ID>
<Kontakt_Zugang>
<Strasse>ABC-Strasse</Strasse>
<Hausnummer>1</Hausnummer>
<Postleitzahl>12345</Postleitzahl>
<Ort>ABC</Ort>
</Kontakt_Zugang>
</Kontaktdaten>
</Krankenhaus>
<Klinik>
<Kontaktdaten>
<Name> Name_DEF </Name>
<ID>123456</ID>
<Kontakt_Zugang>
<Strasse>DEF-Strasse</Strasse>
<Postleitzahl>12345</Postleitzahl>
<Ort>DEF</Ort>
</Kontakt_Zugang>
</Kontaktdaten>
</Klinik>
<Universitaet>
<Kontaktdaten>
<Name> Name_XYZ </Name>
<ID>123456</ID>
<Kontakt_Zugang>
<Strasse>XYZ-Strasse</Strasse>
<Hausnummer>3</Hausnummer>
<Postleitzahl>12345</Postleitzahl>
<Ort>XYZ</Ort>
</Kontakt_Zugang>
</Kontaktdaten>
</Universitaet>
<Other_DATA>
<Some_Var0>
<X>100</X>
<Y>100</Y>
<Z>100</Z>
</Some_Var0>
</Other_DATA>
</Qualitaetsbericht>