【问题标题】:Extracting html table with rowspan values提取具有行跨值的html表
【发布时间】:2015-12-06 19:52:47
【问题描述】:

我使用以下代码(使用 RCurlXML 包)创建的数据框仅将三个字母的团队缩写放在它跨越的第一行中。是否可以添加其他包或附加代码以将数据保留在正确的列中?

library(XML)
library(RCurl)
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T) [2])

【问题讨论】:

    标签: xml r html-table rcurl


    【解决方案1】:

    你其实很接近。您唯一需要做的就是在正确的列和行中获取数据,因为某些行已向左移动。您可以按以下方式实现(借助 data.tablezoo 包):

    # your original code
    url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
    url_source <- readLines(url, encoding = "UTF-8")
    playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T)[2])
    
    # assigning proper names to the columns
    names(playoffs) <- c("shortcode","franchise","years","appearances")
    
    # 1. shift the dat columnwise for the rows in which there is no shortcode
    # 2. fill the resulting NA's with the last observation
    # 3. only keep the last shortcode when the previous ones are the same
    #    because only there the shortcode matches the franchise name
    library(data.table)
    library(zoo)
    setDT(playoffs)[nchar(shortcode) > 3, `:=` (shortcode = NA,
                                                franchise = shortcode,
                                                years = franchise,
                                                appearances = years)
                    ][, shortcode := na.locf(shortcode)
                      ][shortcode == shift(shortcode, 1L, type="lead"), shortcode := NA]
    

    【讨论】:

      【解决方案2】:

      这是一个答案。我冒昧地收集了数据

      library(dplyr)
      library(XML)
      library(RCurl)
      library(stringi)
      library(zoo)
      library(tidyr)
      
      initial_data =
        "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams" %>%
        readLines(encoding = "UTF-8") %>%
        readHTMLTable(stringsAsFactors = F) %>%
        `[[`(2) %>%
        mutate(ID = 1:n(),
               test =
                 V1 %>%
                 stri_detect_regex("^[A-Z]{2,3}$"))
      
      variable_names = c("franchise",
                         "years",
                         "initial_postseason_appearances")
      
      shifts = 
        initial_data %>%
        filter(test %>% `!`) %>%
        setNames(c(variable_names,
                   "trash",
                   "ID",
                   "test"))
      
      team_initial =
        initial_data %>%
        filter(test) %>%
        setNames(c("initial_abbreviation",
                   variable_names,
                   "ID",
                   "test")) %>%
        bind_rows(shifts) %>%
        arrange(ID) %>%
        separate(years, c("start", "end")) %>%
        mutate(abbreviation = initial_abbreviation %>% na.locf,
               split_postseason_appearances =
                 initial_postseason_appearances %>%
                 plyr::mapvalues("–", NA) %>%
                 stri_split_fixed(", ") )
      
      appearance = 
        team_initial %>%
        select(franchise,
               split_postseason_appearances) %>%
        unnest(split_postseason_appearances) %>%
        mutate(postseason_appearance =
                 split_postseason_appearances %>%
                 extract_numeric) %>%
        select(-split_postseason_appearances)
      
      team = 
        team_initial %>%
        select(abbreviation,
               franchise,
               start,
               end)
      

      【讨论】:

        【解决方案3】:

        考虑一个 XML 包解决方案,需要使用 xpathSApply()for 循环和 if/then 逻辑的各种 XPath 表达式。为了捕获跨行的表记录,使用了各种 XPath 字符串函数:string-length(), concat(), and substring():

        library(XML)
        
        # PARSE FROM URL
        url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
        webpage <- readLines(url)
        html = htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
        
        # INITIALIZE LISTS
        code <- c()
        team <- c()
        year <- c()
        postseason <- c()
        
        # APPEND TO LISTS LOOPING ACROSS ALL TEAMS
        numberofteams <- length(xpathSApply(html, "//table[2]//tr/td[1]"))
        
        for (i in (1:numberofteams+1)) {
          # TR NODES WITH LETTER TEAM ABBREVIATION (STRING LENGTH=2 or 3)
          if (as.character(xpathSApply(html, sprintf("string-length(//table[2]/tr[%s]/td[1])", i), xmlValue)) %in% c("2","3")) {
        
            code <- c(code, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
            team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
            year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))
            postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[4]", i), xmlValue))
          } else {
            # TR NODES W/O LETTER TEAM ABBREVIATION       
            code <- c(code, xpathSApply(html, sprintf("substring(concat(//table[2]/tr[position()=%s-1]/td[position()=1 and string-length(.)=3],
                                                       //table[2]/tr[position()=%s-2]/td[position()=1 and string-length(.)=3]), 1, 3)", i, i), xmlValue))
            team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
            year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
            postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))        
        
          }
        }
        
        # COMBINE LISTS INTO DATA FRAME
        playoffs <- data.frame(code = unlist(code), 
                               team = unlist(team), 
                               year = unlist(year), 
                               postseason = unlist(postseason))
        

        【讨论】:

          【解决方案4】:

          试试 htmltab:

          install.packages("htmltab")
          library(htmltab)
          
          purl <- htmlParse(url_source)
          htmltab(purl, which = 2)
          

          【讨论】:

            猜你喜欢
            • 1970-01-01
            • 1970-01-01
            • 1970-01-01
            • 1970-01-01
            • 1970-01-01
            • 2019-04-29
            • 2018-03-28
            • 2020-06-27
            • 2023-01-30
            相关资源
            最近更新 更多