【问题标题】:Looping in RSelenium and Scraping(在 RSelenium 中循环和抓取)
【发布时间】:2018-12-27 10:59:24
【问题描述】:

我正在尝试使用 RSelenium 从网站上抓取数据。我可以单独浏览下拉菜单,但是当我在循环中运行它们时会出错。

此外,在选择下拉列表中的所有值后,我想将设施名称和联系方式存储在表格中。到目前为止,我无法做到这一点。

# Setup for the RSelenium scrape of the KVK facilities page: reset the
# session, load packages, start a browser, and read the state dropdown.

# Remove every object from the workspace and switch to a hard-coded,
# machine-specific working directory (getwd() just echoes it back).
rm(list=ls())
setwd("D:\\work_codes\\kvk\\data")
getwd()

library(RSelenium)
library(rvest)
library(XML)
library(RCurl)
library(magrittr)
library(stringr)

# Start the driver; its "client" element is the browser handle used below.
rd<-rsDriver()
remDr<-rd[["client"]]

remDr$navigate("https://kvk.icar.gov.in/facilities_list.aspx")

remDr$refresh()

# Locate the state dropdown and turn its visible text into a character
# vector: one entry per line, minus the ' --Select--' placeholder, with
# leading whitespace trimmed.
stateEle<-remDr$findElement("id", "ContentPlaceHolder1_ddlState")
states<-stateEle$getElementText()[[1]] %>% strsplit(., '\\n') %>% unlist %>% setdiff(., ' --Select--')
states<-str_trim(states, 'left')
stateEle$clickElement()

# Walk every state -> district -> KVK combination by typing each option's
# text into the corresponding dropdown, then submit and open the first
# facility entry. Nothing is accumulated into a table here.
# Note: 1:length(x) iterates over c(1, 0) when x is empty.
for (i in 1:length(states)) {
  # NOTE(review): stateEle was located before this refresh and is reused
  # afterwards; the handle is probably stale once the page reloads --
  # confirm, and re-find the element after refreshing if so.
  remDr$refresh()
  stateEle$clickElement()
  stateEle$sendKeysToElement(list(states[i]))
  stateEle$clickElement()
  # Read the district dropdown the same way the state dropdown was read.
  districts<-NULL
  distEle<-remDr$findElement("id", "ContentPlaceHolder1_ddlDistrict")
  districts<-distEle$getElementText()[[1]] %>% strsplit(., '\\n') %>% unlist %>% setdiff(., ' --Select--')
  districts<-str_trim(districts, 'left')
  for (j in 1:length(districts)) {
    distEle$clickElement()
    distEle$sendKeysToElement(list(districts[j]))
    distEle$clickElement()
    # Read the KVK dropdown for the chosen district.
    kvk<-NULL
    kvkEle<-remDr$findElement("id", "ContentPlaceHolder1_ddlKvk")
    kvk<-kvkEle$getElementText()[[1]] %>% strsplit(., '\\n') %>% unlist %>% setdiff(., ' --Select--')
    kvk<-str_trim(kvk, 'left')
    for (k in 1:length(kvk)) {
      kvkEle$clickElement()
      # NOTE(review): this always sends the first KVK (kvk[[1]]); the loop
      # index k is never used -- presumably kvk[k] was intended.
      kvkEle$sendKeysToElement(list(kvk[[1]]))
      kvkEle$clickElement()
      submitEle<-remDr$findElement("id", "ContentPlaceHolder1_btnSubmit")
      submitEle$clickElement()
      doc<-remDr$findElement('id', 'ContentPlaceHolder1_rptfacility_f_name_1')
      # The text is fetched but not assigned, so it is discarded.
      doc$getElementText()
      doc$clickElement()
      # NOTE(review): 'Contact details:' looks like visible page text, not a
      # CSS class name, so this lookup probably fails -- confirm against the
      # page. The result is not stored either way.
      remDr$findElement('class name','Contact details:')
    }
  }
}

【问题讨论】:

    标签: for-loop rvest rselenium


    【解决方案1】:
    # Scrape the contact text for every state / district / KVK combination on
    # the KVK facilities page by re-submitting the page's form with rvest
    # instead of driving a browser.
    #
    # Result: `output_df`, one row per contact panel found, with columns
    # STATE, DISTRICT, KVK and CONTACT_TEXT (an empty string when the result
    # page has no `.panel-body` node).
    #
    # html_session(), set_values() and submit_form() are the pre-1.0 rvest
    # names; rvest >= 1.0 calls them session(), html_form_set() and
    # session_submit().
    library(rvest)
    url <- "https://kvk.icar.gov.in/facilities_list.aspx"

    page <- html_session(url)
    form <- html_form(page)[[1]]

    # Read the <option> children of one <select>. `value` is what the form
    # submits, `label` is the visible text. The first option is dropped from
    # both (presumably the "--Select--" placeholder).
    read_options <- function(pg, select_id) {
      nodes <- html_nodes(pg, css = paste0("#", select_id, " > option"))
      list(
        value = html_attr(nodes, "value")[-1],
        label = html_text(nodes)[-1]
      )
    }

    state_opts <- read_options(page, "ContentPlaceHolder1_ddlState")
    states <- state_opts$value
    states_name <- state_opts$label

    # Results are appended one by one. The previous `final_df[i*j*k]` index
    # is not unique across iterations (e.g. 1*2*1 == 2*1*1), so rows were
    # overwritten and gaps were left in the list.
    final_df <- list()

    #### STATES LOOP ####
    # seq_along() keeps each loop from running when a dropdown is empty;
    # 1:length(x) would iterate over c(1, 0) in that case.
    for (i in seq_along(states)) {
      filled_form <- set_values(
        form,
        "ctl00$ContentPlaceHolder1$ddlState" = states[i]
      )
      page1 <- submit_form(page, filled_form)
      district_opts <- read_options(page1, "ContentPlaceHolder1_ddlDistrict")
      district <- district_opts$value
      district_name <- district_opts$label

      #### DISTRICT LOOP ####
      for (j in seq_along(district)) {
        filled_form1 <- set_values(
          html_form(page1)[[1]],
          "ctl00$ContentPlaceHolder1$ddlState" = states[i],
          "ctl00$ContentPlaceHolder1$ddlDistrict" = district[j]
        )
        page2 <- submit_form(page1, filled_form1)
        kvk_opts <- read_options(page2, "ContentPlaceHolder1_ddlKvk")
        kvk <- kvk_opts$value
        kvk_name <- kvk_opts$label

        #### KVK LOOP ####
        for (k in seq_along(kvk)) {
          filled_form2 <- set_values(
            html_form(page2)[[1]],
            "ctl00$ContentPlaceHolder1$ddlState" = states[i],
            "ctl00$ContentPlaceHolder1$ddlDistrict" = district[j],
            "ctl00$ContentPlaceHolder1$ddlKvk" = kvk[k]
          )
          page3 <- submit_form(page2, filled_form2)

          panels <- html_nodes(page3, css = ".panel-body")
          contact_text <- gsub("[\r\n]", "", html_text(panels))
          if (length(contact_text) == 0) {
            contact_text <- ""
          }

          # KVK uses the option label, matching STATE and DISTRICT; the
          # submitted option value was stored here before and kvk_name was
          # never used.
          final_df[[length(final_df) + 1L]] <- data.frame(
            STATE = states_name[i],
            DISTRICT = district_name[j],
            KVK = kvk_name[k],
            CONTACT_TEXT = contact_text,
            stringsAsFactors = FALSE
          )

          # Pause between requests so the server is not overloaded (avoids
          # HTTP 500 errors). Base R's function is Sys.sleep(); sleep() does
          # not exist.
          Sys.sleep(5)
        }
      }
    }


    output_df <- data.table::rbindlist(final_df, fill = TRUE)
    
    # After this perform some string operations to extract the exact information required from the CONTACT_TEXT variable
    

    上面的答案完全没有使用 RSelenium 包,而是用 rvest 直接提交表单来获取数据;我认为这种方式比 RSelenium 更可靠。

    【讨论】:

    • 另一个复选框是什么意思?
    • 没关系@Bharath。我可以通过对您的代码进行一些调整来抓取这些数据。但是,我在抓取其他数据时遇到了另一个问题,并已就此发布了新的提问。如果你能帮我解决那个问题,我将不胜感激。
    猜你喜欢
    • 1970-01-01
    • 2019-04-24
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2020-04-19
    • 1970-01-01
    • 2015-02-03
    • 1970-01-01
    相关资源
    最近更新 更多