使用 rvest 包跨多个页面抓取内容答案

【问题标题】：Webscraping content across multiple pages using rvest package使用 rvest 包跨多个页面抓取内容
【发布时间】：2016-03-17 20:50:47
【问题描述】：

我是一个非常新手的 R 程序员，但我一直在尝试使用 rvest 包从在线大学的网站上进行一些网页抓取。我从网页上抓取的第一个信息表是所有提供的博士级课程的列表。这是我的代码：

library(xml2)
library(httr)
library(rvest)
library(selectr)

刮痧博士生

fileUrl <- read_html("http://www.capella.edu/online-phd-programs/")

使用 chrome 中的选择器小工具工具，我能够选择网站上我想要提取的内容。在这种情况下，我选择了所有博士课程。

Degrees <- fileUrl %>%
html_nodes(".accordianparsys a") %>%
html_text() 
Degrees

接下来，我创建了一个博士学位的数据框。

Capella_Doctoral = data.frame(Degrees)

下面我正在创建另一个列，将这些程序标记为来自 Capella。

Capella_Doctoral$SchoolFlag <- "Capella" 
View(Capella_Doctoral)

在我上面的代码中，一切似乎都很好。但是，我想收集的下一类信息是每个博士课程的学费和学分。此信息存在于每个单独的博士课程的页面上。例如，领导力博士课程将在此页面“http://www.capella.edu/online-degrees/phd-leadership/”上包含学费和学分信息。会计课程 DBA 将在此页面“http://www.capella.edu/online-degrees/dba-accounting/”上包含学费和学分信息。各个页面的共同主题是在“online-degrees/”之后包含程序的名称。

为了创建我需要的各种网页列表（包括博士课程名称的网页），我开发了以下代码。

将博士学位格式化为小写，删除任何前导和尾随空格，然后用破折号替换任何空格

Lowercase <- tolower(Capella_Doctoral$Degrees) 
Lowercase

删除前导和尾随空格

trim <- function (x) gsub("^\\s+|\\s+$", "", x)
Trim <- trim(Lowercase)
Trim

用破折号代替空格

Dashes <- gsub(" ", "-", Trim)
Dashes
Dashes2 <- gsub("---", "-", Dashes)
Dashes2

接下来，我将重新格式化的博士学位添加到以下网址的末尾，以获取我需要从每个课程的学费和学分时间中获取信息的所有可能网址的列表

urls <- rbindlist(sapply(Dashes2, function(x) {
    url <- paste("http://www.capella.edu/online-degrees/",x,"/", sep="")
    data.frame(url)
}), fill=TRUE)
Spec_URLs <- data.frame(urls)
View(Spec_URLs)

现在我已经列出了我需要从中获取信息的所有 url，我需要知道如何为每个 url 使用以下函数。下面的代码仅提取其中一个 URL 的学费和学分信息。如何让它遍历所有 URL？我的最终目标是将每个博士课程的所有学费和学分信息表放入一个数据框中。

fileUrl <- read_html("http://www.capella.edu/online-degrees/phd-leadership/")

Tuition <- fileUrl %>%
   html_nodes("p:nth-child(4) strong , .tooltip~ strong") %>%
   html_text() 
Tuition

结果：学费 [1]“120 积分”“每季度 4,665 美元”

【问题讨论】：

下面的解决方案有效吗？

标签： r web-scraping rvest

【解决方案1】：

这是一个快速而肮脏的....我希望它不会产生比答案更多的问题。本质上，此函数获取链接到部门的所有单个 url……然后对每个返回一个聚合数据对象执行相同的系列。在我们的例子中，一个有 82 行的数据框。如果你想清理它，你可以重新格式化列并清理一下 NA。希望它对你有用。

library(rvest)
library(stringi)
library(htmltools)
library(plyr)
library(dplyr)
library(DT)


# This is a helper function I threw on top..
txt.safe_text <- function(x){
  str_in <- iconv(x, "latin1", "ASCII", sub="")  %>%  stri_enc_toutf8()
  str_in %>%
    stri_replace_all_fixed('<U+0080><U+0093>',"'\\-'") %>%
    stri_enc_toascii %>% htmlEscape %>%
    stri_unescape_unicode %>%
    stri_replace_all_regex("\\032\\032\\032","-")%>%
    stri_replace_all_regex("\n","")
}




# Heres the iterator. I gave it zero args for purposes of the concept but you
# could add varible urls or filtering functions

parse.apella <- function(){


  # html() was deprecated but I use the older version of rvest so set the new name
  # to an alias for reproduction.
  read_html <- html


  # This is our index table. We are going to use this as a key to then qry all
  # other site info but keep a backref to the school variable and url
  idx_df <-
    lapply(read_html("http://www.capella.edu/online-phd-programs/") %>%
             html_nodes(".accordianparsys a"),function(i)
               data.frame(focus = html_text(i),
                          link = paste0("http://www.capella.edu", html_attr(i,"href"))
                          )) %>% rbind.pages

  # Threw this in for use case later with rendering a datatable and then being able to
  # jump straight to the site you are referencing.

  idx_df$html_output <- sapply(1:nrow(idx_df),function(i)
    htmltools::HTML(paste0(sprintf('<a href="%s">%s</a>',idx_df[i,2],idx_df[i,1]))))


  # Ok...so... for every index in our idx_df table above we are going to:
  # read site > parse the p html tags > pass a text cleaning function >
  # replace the leftovers eg:'\t' > split the string on the new line '\n'
  # character for easier user in building a data frame later > filter out all
  # returned data that has a character length of less than  or equal to 2 >
  # create a data frame with a filtering column in our loop.

  # Note: this is going to get the data for I think 84 websites..so give it a second
  # to run.

  A <- llply(1:nrow(idx_df),function(ii)
    lapply(read_html(idx_df[[2]][[ii]]) %>%
             html_nodes(".gernic_large_text > p") %>%
             html_text %>% txt.safe_text %>%
             stri_replace_all_regex("\t","\n") %>%
             strsplit("\n"),function(i)
               stri_split_regex(i,"  ") %>% unlist %>%
             data.frame(raw_txt = .) %>% filter(nchar(raw_txt)>2) %>%
             mutate(df_idx = 1:length(raw_txt),
                    school_name = idx_df[[1]][[ii]],
                    html_link = idx_df[[3]][[ii]])
    )
  )


  # Above we built a list of data frames...and the rule we know is that any information
  # we are interested in would produce at least two rows of data as we split
  # our raw html on the new line character. This means any data frame in our list
  # with 1 row is non-imporant but was easier to filter out than parse out earlier.
  # So we remove all those data frames with only 1 row.
  CC <- lapply(1:length(A),function(i)A[[i]][mapply(nrow,A[[i]]) == 2] %>% rbind.pages)


  # Helper function for looping through. I shouldn't have used numbers for the column names
  # but i'm just slapping this together.
  # This is going to essentially go through our data frames and transpose the structure
  # so that our final product is a wide data structure rather than a long.

  trans_df <- function(df_in = NULL,i){
    tmp_d <-
      as.data.frame(
        t(c(df_in[[i]][df_in[[i]][[2]] == 2,4][[1]],
            df_in[[i]][df_in[[i]][[2]] == 2,3][[1]],
            df_in[[i]][df_in[[i]][[2]] == 2,1]))
      )

    colnames(tmp_d) <-  c('html_link','school name',df_in[[i]][df_in[[i]][[2]] == 1,1])
    tmp_d
  }


  #  For every index in our list we're going to transpose our structures
  # And do some text cleaning and splitting
  all_dat <- ldply(1:length(CC),function(i)trans_df(df_in = CC,i)) %>%
    mutate(short_name = stri_extract_first_words(`school name`),
           Cost =
             ifelse(!is.na(Cost),
                    stri_extract_first_words(Cost),
                    'Not Listed')
           ) %>% mutate(program =
                   stri_replace_all_regex(
                     `school name`,
                     paste0('(',short_name,'| - )'),"") %>%
                   stri_trim_both) %>%
    mutate(next_session = as.Date(strptime(`Next Start Date`,"%b. %d,%Y"))) %>%
    mutate(Cost = as.numeric(gsub(",","",Cost))) %>% 
  select(html_link,
         short_name,
         program,
         cost = Cost,
         credit_hours = `Transfer Credits`,
         next_session,
         total_credits = `Total Quarter Credits`,
         session_length = `Course Length`)

  # Quick thing I noticed on the credit hours. Loop back over and
  # grab only the numeric values
  all_dat$credit_hours <-
    lapply(all_dat$credit_hours,function(i)
      stri_extract_all_regex(i,"[[:digit:]]") %>%
        unlist %>% paste0(collapse = "") %>% as.numeric) %>%
    unlist


  # Should be done
  return(all_dat)
}



rock.apella <- parse.apella()

str(rock.apella)
# 'data.frame':  82 obs. of  8 variables:
# $ html_link     : chr  "<a href=\"http://www.capella.edu/online-degrees/phd-leadership\">PHD - Leadership </a>"| __truncated__ ...
# $ short_name    : chr  "PHD" "PHD" "PHD" "PHD" ...
# $ program       : chr  "Leadership" "Information Technology Education" "General Information Technology" "Information Assurance and Security" ...
# $ cost          : num  4665 4665 4665 4665 4665 ...
# $ credit_hours  : num  32 32 48 32 32 32 32 32 48 32 ...
# $ next_session  : Date, format: "2016-04-11" "2016-04-11" "2016-04-11" "2016-04-11" ...
# $ total_credits : chr  "120 Credits" "120 Credits" "120 Credits" "120 Credits" ...
# $ session_length: chr  "10 weeks" "10 weeks" "10 weeks" "10 weeks" ...

DT::datatable(rock.apella,escape = F, options = list(searchHighlight = TRUE), filter = 'top')

这是我们的最终输出

以及 jsfiddle 中的输出https://jsfiddle.net/cbfas/0x37vudv/1/

【讨论】：