这是一个快速而肮脏的....我希望它不会产生比答案更多的问题。本质上,此函数获取链接到部门的所有单个 url……然后对每个返回一个聚合数据对象执行相同的系列。在我们的例子中,一个有 82 行的数据框。
如果你想清理它,你可以重新格式化列并清理一下 NA。希望它对你有用。
library(rvest)
library(stringi)
library(htmltools)
library(plyr)
library(dplyr)
library(DT)
# This is a helper function I threw on top..
txt.safe_text <- function(x){
str_in <- iconv(x, "latin1", "ASCII", sub="") %>% stri_enc_toutf8()
str_in %>%
stri_replace_all_fixed('<U+0080><U+0093>',"'\\-'") %>%
stri_enc_toascii %>% htmlEscape %>%
stri_unescape_unicode %>%
stri_replace_all_regex("\\032\\032\\032","-")%>%
stri_replace_all_regex("\n","")
}
# Heres the iterator. I gave it zero args for purposes of the concept but you
# could add varible urls or filtering functions
parse.apella <- function(){
# html() was deprecated but I use the older version of rvest so set the new name
# to an alias for reproduction.
read_html <- html
# This is our index table. We are going to use this as a key to then qry all
# other site info but keep a backref to the school variable and url
idx_df <-
lapply(read_html("http://www.capella.edu/online-phd-programs/") %>%
html_nodes(".accordianparsys a"),function(i)
data.frame(focus = html_text(i),
link = paste0("http://www.capella.edu", html_attr(i,"href"))
)) %>% rbind.pages
# Threw this in for use case later with rendering a datatable and then being able to
# jump straight to the site you are referencing.
idx_df$html_output <- sapply(1:nrow(idx_df),function(i)
htmltools::HTML(paste0(sprintf('<a href="%s">%s</a>',idx_df[i,2],idx_df[i,1]))))
# Ok...so... for every index in our idx_df table above we are going to:
# read site > parse the p html tags > pass a text cleaning function >
# replace the leftovers eg:'\t' > split the string on the new line '\n'
# character for easier user in building a data frame later > filter out all
# returned data that has a character length of less than or equal to 2 >
# create a data frame with a filtering column in our loop.
# Note: this is going to get the data for I think 84 websites..so give it a second
# to run.
A <- llply(1:nrow(idx_df),function(ii)
lapply(read_html(idx_df[[2]][[ii]]) %>%
html_nodes(".gernic_large_text > p") %>%
html_text %>% txt.safe_text %>%
stri_replace_all_regex("\t","\n") %>%
strsplit("\n"),function(i)
stri_split_regex(i," ") %>% unlist %>%
data.frame(raw_txt = .) %>% filter(nchar(raw_txt)>2) %>%
mutate(df_idx = 1:length(raw_txt),
school_name = idx_df[[1]][[ii]],
html_link = idx_df[[3]][[ii]])
)
)
# Above we built a list of data frames...and the rule we know is that any information
# we are interested in would produce at least two rows of data as we split
# our raw html on the new line character. This means any data frame in our list
# with 1 row is non-imporant but was easier to filter out than parse out earlier.
# So we remove all those data frames with only 1 row.
CC <- lapply(1:length(A),function(i)A[[i]][mapply(nrow,A[[i]]) == 2] %>% rbind.pages)
# Helper function for looping through. I shouldn't have used numbers for the column names
# but i'm just slapping this together.
# This is going to essentially go through our data frames and transpose the structure
# so that our final product is a wide data structure rather than a long.
trans_df <- function(df_in = NULL,i){
tmp_d <-
as.data.frame(
t(c(df_in[[i]][df_in[[i]][[2]] == 2,4][[1]],
df_in[[i]][df_in[[i]][[2]] == 2,3][[1]],
df_in[[i]][df_in[[i]][[2]] == 2,1]))
)
colnames(tmp_d) <- c('html_link','school name',df_in[[i]][df_in[[i]][[2]] == 1,1])
tmp_d
}
# For every index in our list we're going to transpose our structures
# And do some text cleaning and splitting
all_dat <- ldply(1:length(CC),function(i)trans_df(df_in = CC,i)) %>%
mutate(short_name = stri_extract_first_words(`school name`),
Cost =
ifelse(!is.na(Cost),
stri_extract_first_words(Cost),
'Not Listed')
) %>% mutate(program =
stri_replace_all_regex(
`school name`,
paste0('(',short_name,'| - )'),"") %>%
stri_trim_both) %>%
mutate(next_session = as.Date(strptime(`Next Start Date`,"%b. %d,%Y"))) %>%
mutate(Cost = as.numeric(gsub(",","",Cost))) %>%
select(html_link,
short_name,
program,
cost = Cost,
credit_hours = `Transfer Credits`,
next_session,
total_credits = `Total Quarter Credits`,
session_length = `Course Length`)
# Quick thing I noticed on the credit hours. Loop back over and
# grab only the numeric values
all_dat$credit_hours <-
lapply(all_dat$credit_hours,function(i)
stri_extract_all_regex(i,"[[:digit:]]") %>%
unlist %>% paste0(collapse = "") %>% as.numeric) %>%
unlist
# Should be done
return(all_dat)
}
rock.apella <- parse.apella()
str(rock.apella)
# 'data.frame': 82 obs. of 8 variables:
# $ html_link : chr "<a href=\"http://www.capella.edu/online-degrees/phd-leadership\">PHD - Leadership </a>"| __truncated__ ...
# $ short_name : chr "PHD" "PHD" "PHD" "PHD" ...
# $ program : chr "Leadership" "Information Technology Education" "General Information Technology" "Information Assurance and Security" ...
# $ cost : num 4665 4665 4665 4665 4665 ...
# $ credit_hours : num 32 32 48 32 32 32 32 32 48 32 ...
# $ next_session : Date, format: "2016-04-11" "2016-04-11" "2016-04-11" "2016-04-11" ...
# $ total_credits : chr "120 Credits" "120 Credits" "120 Credits" "120 Credits" ...
# $ session_length: chr "10 weeks" "10 weeks" "10 weeks" "10 weeks" ...
DT::datatable(rock.apella,escape = F, options = list(searchHighlight = TRUE), filter = 'top')
这是我们的最终输出
以及 jsfiddle 中的输出https://jsfiddle.net/cbfas/0x37vudv/1/