可以考虑使用 XML 包的解决方案:通过 xpathSApply() 配合 for 循环和 if/else 逻辑来执行各种 XPath 表达式。为了捕获跨行的表记录,这里使用了多个 XPath 字符串函数:string-length()、concat() 和 substring():
library(XML)
# PARSE FROM URL
# Download the page source as a character vector of lines, then parse that
# text (asText = TRUE) into an internal-node document so that XPath queries
# can be run against it.
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
webpage <- readLines(url)
html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
# INITIALIZE LISTS
# Number of rows to visit: one per <tr> of the target table that has a first
# <td> cell.
# NOTE(review): this count matches rows at any depth (//tr) while the per-row
# lookups below address direct children (/tr[i]); confirm both agree for the
# page's current markup.
numberofteams <- length(xpathSApply(html, "//table[2]//tr/td[1]"))

# Preallocate one slot per row instead of growing the results with c() on
# every iteration.
code <- vector("list", numberofteams)
team <- vector("list", numberofteams)
year <- vector("list", numberofteams)
postseason <- vector("list", numberofteams)

# Result of applying xmlValue to cell `col` of row `row` in the target table.
cell_value <- function(row, col) {
  xpathSApply(html, sprintf("//table[2]/tr[%d]/td[%d]", row, col), xmlValue)
}

# XPath template for rows that carry no abbreviation of their own: concatenate
# the 3-character first cells of the row directly above and the row two above,
# then keep the first three characters of the result.
inherited_code_xpath <- paste0(
  "substring(concat(",
  "//table[2]/tr[position()=%d-1]/td[position()=1 and string-length(.)=3],\n",
  "//table[2]/tr[position()=%d-2]/td[position()=1 and string-length(.)=3]",
  "), 1, 3)"
)

# FILL SLOTS LOOPING ACROSS ALL TEAMS
# seq_len() produces no iterations when no rows were found, whereas the
# previous 1:numberofteams+1 would have iterated over c(2, 1) in that case.
for (slot in seq_len(numberofteams)) {
  # Rows are visited at positions 2..(numberofteams + 1), exactly as before
  # (presumably to skip a header row -- TODO confirm). Integer indices also
  # keep the formatted XPath free of scientific notation.
  i <- slot + 1L

  # TR NODES WITH LETTER TEAM ABBREVIATION (STRING LENGTH=2 or 3)
  first_cell_length <- xpathSApply(
    html, sprintf("string-length(//table[2]/tr[%d]/td[1])", i), xmlValue
  )
  has_own_code <- as.character(first_cell_length) %in% c("2", "3")

  if (has_own_code) {
    # Abbreviation sits in the first cell; the remaining fields follow it.
    row_code <- cell_value(i, 1L)
    first_col <- 2L
  } else {
    # TR NODES W/O LETTER TEAM ABBREVIATION
    # The abbreviation is taken from the rows above; fields start at cell 1.
    row_code <- xpathSApply(html, sprintf(inherited_code_xpath, i, i), xmlValue)
    first_col <- 1L
  }

  # Wrap in list() so that an empty result still occupies its slot.
  code[slot] <- list(row_code)
  team[slot] <- list(cell_value(i, first_col))
  year[slot] <- list(cell_value(i, first_col + 1L))
  postseason[slot] <- list(cell_value(i, first_col + 2L))
}

# Flatten the per-row slots back into plain vectors for downstream use.
code <- unlist(code)
team <- unlist(team)
year <- unlist(year)
postseason <- unlist(postseason)
# COMBINE LISTS INTO DATA FRAME
# Flatten each accumulator with unlist() and hand the four named columns to
# data.frame() in a single call.
playoffs <- do.call(
  data.frame,
  lapply(
    list(code = code, team = team, year = year, postseason = postseason),
    unlist
  )
)