提取具有行跨值的html表答案

【问题标题】：Extracting html table with rowspan values提取具有行跨值的html表
【发布时间】：2015-12-06 19:52:47
【问题描述】：

我使用以下代码（使用 RCurl 和 XML 包）创建的数据框仅将三个字母的团队缩写放在它跨越的第一行中。是否可以添加其他包或附加代码以将数据保留在正确的列中？

library(XML)
library(RCurl)
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T) [2])

【问题讨论】：

标签： xml r html-table rcurl

【解决方案1】：

你其实很接近。您唯一需要做的就是在正确的列和行中获取数据，因为某些行已向左移动。您可以按以下方式实现（借助 data.table 和 zoo 包）：

# your original code
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T)[2])

# assigning proper names to the columns
names(playoffs) <- c("shortcode","franchise","years","appearances")

# 1. shift the dat columnwise for the rows in which there is no shortcode
# 2. fill the resulting NA's with the last observation
# 3. only keep the last shortcode when the previous ones are the same
#    because only there the shortcode matches the franchise name
library(data.table)
library(zoo)
setDT(playoffs)[nchar(shortcode) > 3, `:=` (shortcode = NA,
                                            franchise = shortcode,
                                            years = franchise,
                                            appearances = years)
                ][, shortcode := na.locf(shortcode)
                  ][shortcode == shift(shortcode, 1L, type="lead"), shortcode := NA]

【讨论】：

【解决方案2】：

这是一个答案。我冒昧地收集了数据

library(dplyr)
library(XML)
library(RCurl)
library(stringi)
library(zoo)
library(tidyr)

initial_data =
  "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams" %>%
  readLines(encoding = "UTF-8") %>%
  readHTMLTable(stringsAsFactors = F) %>%
  `[[`(2) %>%
  mutate(ID = 1:n(),
         test =
           V1 %>%
           stri_detect_regex("^[A-Z]{2,3}$"))

variable_names = c("franchise",
                   "years",
                   "initial_postseason_appearances")

shifts = 
  initial_data %>%
  filter(test %>% `!`) %>%
  setNames(c(variable_names,
             "trash",
             "ID",
             "test"))

team_initial =
  initial_data %>%
  filter(test) %>%
  setNames(c("initial_abbreviation",
             variable_names,
             "ID",
             "test")) %>%
  bind_rows(shifts) %>%
  arrange(ID) %>%
  separate(years, c("start", "end")) %>%
  mutate(abbreviation = initial_abbreviation %>% na.locf,
         split_postseason_appearances =
           initial_postseason_appearances %>%
           plyr::mapvalues("–", NA) %>%
           stri_split_fixed(", ") )

appearance = 
  team_initial %>%
  select(franchise,
         split_postseason_appearances) %>%
  unnest(split_postseason_appearances) %>%
  mutate(postseason_appearance =
           split_postseason_appearances %>%
           extract_numeric) %>%
  select(-split_postseason_appearances)

team = 
  team_initial %>%
  select(abbreviation,
         franchise,
         start,
         end)

【讨论】：

【解决方案3】：

考虑一个 XML 包解决方案，需要使用 xpathSApply() 和 for 循环和 if/then 逻辑的各种 XPath 表达式。为了捕获跨行的表记录，使用了各种 XPath 字符串函数：string-length(), concat(), and substring():

library(XML)

# PARSE FROM URL
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
webpage <- readLines(url)
html = htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)

# INITIALIZE LISTS
code <- c()
team <- c()
year <- c()
postseason <- c()

# APPEND TO LISTS LOOPING ACROSS ALL TEAMS
numberofteams <- length(xpathSApply(html, "//table[2]//tr/td[1]"))

for (i in (1:numberofteams+1)) {
  # TR NODES WITH LETTER TEAM ABBREVIATION (STRING LENGTH=2 or 3)
  if (as.character(xpathSApply(html, sprintf("string-length(//table[2]/tr[%s]/td[1])", i), xmlValue)) %in% c("2","3")) {

    code <- c(code, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
    team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
    year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))
    postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[4]", i), xmlValue))
  } else {
    # TR NODES W/O LETTER TEAM ABBREVIATION       
    code <- c(code, xpathSApply(html, sprintf("substring(concat(//table[2]/tr[position()=%s-1]/td[position()=1 and string-length(.)=3],
                                               //table[2]/tr[position()=%s-2]/td[position()=1 and string-length(.)=3]), 1, 3)", i, i), xmlValue))
    team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
    year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
    postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))        

  }
}

# COMBINE LISTS INTO DATA FRAME
playoffs <- data.frame(code = unlist(code), 
                       team = unlist(team), 
                       year = unlist(year), 
                       postseason = unlist(postseason))

【讨论】：

【解决方案4】：

试试 htmltab：

install.packages("htmltab")
library(htmltab)

purl <- htmlParse(url_source)
htmltab(purl, which = 2)

【讨论】：