【问题标题】:add a field to a list of scraped tables using rvest使用 rvest 将字段添加到已抓取表的列表中
【发布时间】:2019-05-18 09:17:00
【问题描述】:

我已经成功地抓取了我想要的数据(在 SO 用户的帮助下),但是我错过了每个抓取的表中的数据代表谁的关键。因此,我尝试使用 mutate 添加一个名为 player 的字段,该字段与 player[[j]] 相同,但这不适用于列表。我已经阅读了有关 lapply 的内容并尝试过,但也没有成功。关于如何实现这一点的任何建议?

library(rvest)
library(plyr)
library(dplyr)
library(tidyr)


### get a list of players
page <- (0:18)
urls <- list()
for (i in 1:length(page)) {
  url<- paste0("https://www.mlssoccer.com/players?page=",page[i])
  urls[[i]] <- url
}

tbl <- list()
j <- 1
for (j in seq_along(urls)) {
  tbl[[j]] <- urls[[j]] %>%   
    read_html() %>% 
    html_nodes("a.name_link") %>%
    html_text()
  j <- j+1
  if (j == length(urls)) break
}

### join all of the names into one data frame
tbl <- ldply(tbl, data.frame)


player_tb<- as.data.frame(lapply(tbl, tolower))
colnames(player_tb) <- 'name'
player_table<- as.list(gsub(" ", "-", player_tb$name)) 
colnames(player_table) <- 'player'

#### using a list of players, get the game summary for each regular    season game, adding the player name to the table
pages<- list()
for( i in seq_along(player_table)) {
  page <- paste0("https://www.mlssoccer.com/players/",player_table[i])
  pages[[i]] <- page
}


player_stats <- list()
j <- 1
for (j in seq_along(pages)) {
  player_stats[[j]] <- pages[[j]] %>%   
    read_html() %>% 
    html_nodes("table") %>%
    html_table() %>%
    mutate(player = player)  ## this is the piece that fails
  j <- j+1                   
  if (j == length(pages)) break
}

t <- do.call(rbind, player_stats)

【问题讨论】:

    标签: r web-scraping dplyr rvest


    【解决方案1】:

    您可以尝试使用purrr 包来避免for 循环并加快处理速度

    使用purrr,您还可以拥有safelypossiblyquietly 这些非常酷的功能。有些玩家没有统计数据,您的代码失败。现在不会了

    这个想法是在一个大数据框中收集所有统计数据,并有一个带有玩家姓名的标识符列

    library(rvest)
    library(tidyverse)
    
    # lets assume 3 pages only to do it quickly
    page <- (0:2)
    
    # no need to create a list. Just a vector
    urls = paste0("https://www.mlssoccer.com/players?page=", page)
    
    # define this function that collects the player's name from a url
    get_the_names = function( url){
      url %>% 
        read_html() %>% 
        html_nodes("a.name_link") %>% 
        html_text()
    }
    
    # map the urls to the function that gets the names
    players = map(urls, get_the_names) %>% 
      # turn into a single character vector
      unlist() %>% 
      # make lower case
      tolower() %>% 
      # replace the `space` to underscore
      str_replace_all(" ", "-")
    
    
    # Now create a vector of player urls
    player_urls = paste0("https://www.mlssoccer.com/players/", players )
    
    # define a function that reads the 3rd table of the url
    get_the_summary_stats <-  function(url){
    
      url %>% 
        read_html() %>% 
        html_nodes("table") %>% 
        html_table() %>% .[[3]] 
    }
    
    # lets read 3 players only to speed things up [otherwise it takes a significant amount of time to run...]
    a_few_players = player_urls[1:3]
    
    # get the stats 
    tables = a_few_players %>% 
      # important step so I can name the rows I get in the table
      set_names() %>% 
      #map the player urls to the function that reads the 3rd table
      # note the `safely` wrap around the get_the_summary_stats' function
      # since there are players with no stats and causes an error (eg.brenden-aaronson )
      # the output will be a list of lists [result and error]
      map(., safely(get_the_summary_stats)) %>% 
      # collect only the `result` output (the table) INTO A DATA FRAME
      # There is also an `error` output
      # also, name each row with the players name
      map_df("result", .id = "player") %>% 
      #keep only the player name (remove the www.mls.... part)
      mutate(player = str_replace(player, "https://www.mlssoccer.com/players/", "")) %>% 
      as_tibble()
    

    让我们看看我们得到了多少

      tables %>% count(player)
    
    # A tibble: 2 x 2
      player                n
      <chr>             <int>
    1 anatole-abang        81
    2 saad-abdul-salaam   136
    

    现在您可以按玩家名称过滤数据框

      tables %>% 
      filter(player == "anatole-abang")
    
    # A tibble: 81 x 14
       player        Date       Match      Result Appearance  MINS     G     A  SHTS   SOG    FC    FS     Y     R
       <chr>         <chr>      <chr>      <chr>  <chr>      <int> <int> <int> <int> <int> <int> <int> <int> <int>
     1 anatole-abang 10/28/2018 ORL @ RBNY W 0-1  Unused Sub     0     0     0     0     0     0     0     0     0
     2 anatole-abang 10/21/2018 RBNY @ PHI W 1-0  Unused Sub     0     0     0     0     0     0     0     0     0
     3 anatole-abang 10/06/2018 RBNY @ SJ  W 3-1  Unused Sub     0     0     0     0     0     0     0     0     0
     4 anatole-abang 9/30/2018  ATL @ RBNY W 0-2  Unused Sub     0     0     0     0     0     0     0     0     0
     5 anatole-abang 9/22/2018  TOR @ RBNY W 0-2  Unused Sub     0     0     0     0     0     0     0     0     0
     6 anatole-abang 9/16/2018  RBNY @ DC  T 3-3  Unused Sub     0     0     0     0     0     0     0     0     0
     7 anatole-abang 9/01/2018  RBNY @ MTL L 0-3  Unused Sub     0     0     0     0     0     0     0     0     0
     8 anatole-abang 8/29/2018  HOU @ RBNY W 0-1  Unused Sub     0     0     0     0     0     0     0     0     0
     9 anatole-abang 8/26/2018  DC @ RBNY  W 0-1  Unused Sub     0     0     0     0     0     0     0     0     0
    10 anatole-abang 8/22/2018  RBNY @ NYC T 1-1  Unused Sub     0     0     0     0     0     0     0     0     0
    # ... with 71 more rows
    

    【讨论】:

    • 这很好用。谢谢你的时间。我以前没有使用过 purr 包。是否有一个 sys.sleep() 我可以使用 purr 包来实现更负责任的抓取?
    • 是的,有!在这里查看一些不错的技巧hvitfeldt.me/2018/01/purrr-tips-and-tricks
    • 我试过这个tables = a_few_players %&gt;% set_names() %&gt;% map(., ~{ Sys.sleep(5) safely(get_the_summary_stats) }) %&gt;% map_df("result", .id = "player") %&gt;% #keep only the player name (remove the www.mls.... part) mutate(player = str_replace(player, "https://www.mlssoccer.com/players/", "")) %&gt;% as_tibble() 但我得到一个错误Dont know how to pluck 有什么想法吗?
    【解决方案2】:

    您遇到的问题是由于玩家状态返回 4 个单独的表而不是一个。
    我已经稍微简化了您的代码,但不是最终解决方案,最终结果是列表列表。您现在可以在最终列表中使用lapply 来收集每个单独的表并在需要时将它们组合起来。

    library(rvest)
    library(dplyr)
    library(tidyr)
    
    ### get a list of players
    page <- (0:18)
    urls<- paste0("https://www.mlssoccer.com/players?page=",page)
    
    tbl <- list()
    for (j in seq_along(urls)) {
      tbl[[j]] <- urls[j] %>%   
        read_html() %>% 
        html_nodes("a.name_link") %>%
        html_text()
    #add a delay so not to overwhelm server
     Sys.sleep(0.75)
    }
    
    ### join all of the names into one data frame
    player_tb<- tolower(unlist(tbl))
    player_table <-data.frame(player= gsub(" ", "-", player_tb))
    
    #### using a list of players, get the game summary for each regular    season game, adding the player name to the table
    pages <- paste0("https://www.mlssoccer.com/players/",player_table$player)
    
    player_stats <- list()
    for (j in seq_along(pages)) {
      player_stats[[j]] <- pages[j] %>%   
        read_html() %>%   
        html_nodes("table") %>%
        html_table() 
      #determine if the status are present
      #bind player name to the table 
      if (length(ttables)==4){
        player_stats[[j]]<-cbind(player_table$player[j], ttables[[3]])
      } else {
        player_stats[[j]]<-cbind(player_table$player[j], ttables[[1]])
      }
      #add a delay so not to overwhelm server
      #get up and stretch your legs!
      Sys.sleep(0.75)  
    }
    #combine all of the player status into one dataframe
    finalanswer<-do.call(rbind, player_stats)
    

    此代码假定播放统计信息有 1 或 4 个与之关联的表,如果不是这样,则需要更改 if/else 语句以匹配。
    希望这对您有所帮助。

    【讨论】:

    • 感谢@Dave2e...非常有帮助。我看到了 4 张桌子的问题。我只想要第三张桌子。我试过player_stats &lt;- list() for (j in seq_along(pages)) { player_stats[[j]] &lt;- pages[j] %&gt;% read_html() %&gt;% html_node("table") %&gt;% .[3] %&gt;% html_table() } 但这不起作用。关于获取特定表的任何建议?
    • 谢谢@Dave2e - 感谢您的宝贵时间。我尝试了以下修改,包括 trycatch - 程序运行没有错误,但没有收集数据。 player_stats &lt;- list() for (j in seq_along(pages)) { tryCatch( player_stats[[j]] &lt;- pages[j] %&gt;% read_html() %&gt;% html_node("table") %&gt;% .[3] %&gt;% html_table(), error = function(e){0} ) }
    猜你喜欢
    • 1970-01-01
    • 2017-03-07
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2019-07-06
    • 1970-01-01
    相关资源
    最近更新 更多