【发布时间】:2017-08-29 07:25:56
【问题描述】:
我编写了以下代码来每天从门户网站中删除招标信息。
packages <- c('rvest', 'stringi', 'tidyverse','lubridate','dplyr')
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE)
start_time <- proc.time()
要废弃的主页并获取记录总数。
data <- read_html('https://eprocure.gov.in/mmp/latestactivetenders')
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
All_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
All_tenders <- cbind(All_tenders,links_fair)
读取要获取的记录总数
Count_of_Recs_raw <- html_nodes(data, xpath = '//*[(@id = "edit-l-active-teners")]//div')
Count_of_Recs <- as.numeric(gsub("Total Tenders : ","",html_text(Count_of_Recs_raw[1])))
用于清理和处理日期和因素等数据字段的函数。
process_dates <- function(data){
cols2date <- c('Bid.Submission.Closing.Date','epublished_date','document_download_start_date','bid_submission_start_date','bid_opening_date','document_download_end_date','bid_submission_end_date')
date_processed_data <- data
date_processed_data[cols2date] <- lapply(data[cols2date] , dmy_hm)
return(date_processed_data)
}
clean_process_data <- function(data){
cols2factor <- c('State.Name','product_category','pre_qualification','organisation_name','organisation_type','tender_type')
clean_processed_data <- data
clean_processed_data[cols2factor] <- lapply(data[cols2factor] , factor)
#clean_processed_data <- process_dates(clean_processed_data)
return(clean_processed_data)
}
下面的代码正是我的问题所在...
表报废从这里开始。第一页已经被报废以获取数据框的结构。
for (page_no in 2:round(Count_of_Recs/10)){
closeAllConnections()
on.exit(closeAllConnections())
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
url <- paste(url_bit1, page_no, sep="")
cat(page_no,"\t",proc.time() - start_time,"\n")
data <- read_html(url)
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
Page_tenders <- cbind(Page_tenders,links_fair)
All_tenders <- rbind(All_tenders,Page_tenders)
}
这个 for 循环通常需要几个小时才能完成。 我正在寻找使用 apply 系列来获得良好的效果,以便节省时间。 该程序还负责获取和处理所有记录,然后再次处理每个单独的记录每次都报废一个全新的页面(此处未列出代码)....
我尝试了以下代码,但它没有给我想要的:
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
closeAllConnections()
on.exit(closeAllConnections())
url <- paste(url_bit1, datain$S.No., sep="")
cat(S.No.,"\t",proc.time() - start_time,"\n")
data <- read_html(url)
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
Page_tenders <- cbind(Page_tenders,links_fair)
All_tenders <- rbind(All_tenders,Page_tenders)
}
All_tenders <- sapply(All_tenders, FUN=read_page(All_tenders$S.No.))
欢迎任何建议、指导、建议、意见或帮助。我只使用 R 3-4 个月。我也知道 Python 在这个问题上相对于 R 的优势,但我倾向于 R 来解决这个问题。
【问题讨论】: