【问题标题】:How to POST request multiple times in the same session using rvest and httr?如何使用 rvest 和 httr 在同一个会话中多次发布请求?
【发布时间】:2021-01-31 19:52:02
【问题描述】:

我正在尝试从 https://www.tefas.gov.tr/TarihselVeriler.aspx 中抓取土耳其基金市场数据。但是,数据不驻留在 HTML 标记中,所以我发布了一个带有表单的请求,以获取页面上的数据。

我可以通过正确的参数从第一页成功获取数据,但是,我无法遍历页面并从下一页获取数据,即使添加了

'ctl00$MainContent$ScriptManager1'='ctl00$MainContent$UpdatePanel1|ctl00$MainContent$ImageButtonGenelNext'

到第二个发帖请求表。我不确定根本原因,但我怀疑每次我发送一个发布请求时,它不是从同一个会话发送请求,而是将请求视为来自新会话。因此,为了解决这个问题,我使用来自的数据为发布请求设置了 cookie tefas_session 但还是不行。您可以通过price_table1$FonKodu==price_table2$FonKodu 检查两个post 请求是否返回相同的数据。我原以为 price_table1 会从第一页带来数据,而 price_table2 会从第二页带来数据。

这是我目前写的代码:


library(rvest)
library(dplyr)
library(httr)

url<-"https://www.tefas.gov.tr/TarihselVeriler.aspx"

tefas_session<-html_session(
    url,
    httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36")
)

tefas_form<-tefas_session %>%
    html_form()

fields<-tefas_form[[1]]$fields

#Arguments

fund_type<-'YAT' #Optional, default is 'YAT' which brings investment funds

fundId<-'' #Optional, default is empty string


TextBoxStartDate<-"15.01.2021"
TextBoxEndDate<-"30.01.2021"




first_page<-rvest:::request_POST(
    x = tefas_session,
    url = url,
    httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"),
    body=list(
        #these 4 values contain session data
        '__VIEWSTATE'=fields$'__VIEWSTATE'$value,
        '__VIEWSTATEGENERATOR'=fields$'__VIEWSTATEGENERATOR'$value,
        '__VIEWSTATEENCRYPTED'=fields$'__VIEWSTATEENCRYPTED'$value,
        '__EVENTVALIDATION'=fields$'__EVENTVALIDATION'$value,
        
        #this fields selects the type of a fund.
        #'YAT' fetches investment funds, 'EMK' fetches retirement funds
        'ctl00$MainContent$RadioButtonListFundMainType'=fund_type,
        
        #These are supposed to be empty
        'ctl00$MainContent$TextBoxOtherFund'='',
        'ctl00$MainContent$TextBoxWatermarkExtenderFund_ClientState'='',
        
        #this is optional. If left blank, it brings all funds. Else, it brings given fund
        'ctl00$MainContent$HiddenFieldFundId'=fundId,
        
        #See DropDownListExtraFundType
        'ctl00$MainContent$DropDownListExtraFundType'=DropDownListExtraFundType[[1]],
        
        #see DropDownListFundTypeExplanation
        'ctl00$MainContent$DropDownListFundTypeExplanation'=DropDownListFundTypeExplanation[[1]],
        
        #start date for data, minimum date
        'ctl00$MainContent$TextBoxStartDate'=TextBoxStartDate,
        
        'ctl00$MainContent$TextBoxWatermarkExtenderStartDate_ClientState'='',
        
        #last date for data, maximum date
        'ctl00$MainContent$TextBoxEndDate'=TextBoxEndDate,
        
        'ctl00$MainContent$ButtonSearchDates'='Görüntüle',
        
        'ctl00$MainContent$ScriptManager1'='ctl00$MainContent$UpdatePanel1|ctl00$MainContent$ButtonSearchDates'
    ),
    encode="form",
    set_cookies('ASP.NET_SessionId'=session_cookies$value[1],
                'TS01ec1a88'=session_cookies$value[2],
                'TS392566ef027'=session_cookies$value[3]))




price_table1<-read_html(first_page) %>%
    html_nodes(xpath = '//*[@id="MainContent_GridViewGenel"]') %>%
    html_table() %>%
    as.data.frame() %>%
    mutate(
        across(
            where(is.character),
            function(x) { gsub(pattern = ",",replacement = ".",x = x) })
    )



#To-Do
# Find a way to traverse through pages.

next_page<-'ctl00$MainContent$UpdatePanel1|ctl00$MainContent$ImageButtonGenelNext'

second_page<-rvest:::request_POST(
    x = tefas_session,
    url = url,
    httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"),
    body=list(
        #TO-DO: ENTER ALL FORM VALUES
        #these 4 values are used for cookies
        '__VIEWSTATE'=fields$'__VIEWSTATE'$value,
        '__VIEWSTATEGENERATOR'=fields$'__VIEWSTATEGENERATOR'$value,
        '__VIEWSTATEENCRYPTED'=fields$'__VIEWSTATEENCRYPTED'$value,
        '__EVENTVALIDATION'=fields$'__EVENTVALIDATION'$value,
        
        '__EVENTTARGET'=next_page,
        
        #this fields selects the type of a fund.
        #'YAT' fetches investment funds, 'EMK' fetches retirement funds
        'ctl00$MainContent$RadioButtonListFundMainType'=fund_type,
        
        #These are supposed to be empty
        'ctl00$MainContent$TextBoxOtherFund'='',
        'ctl00$MainContent$TextBoxWatermarkExtenderFund_ClientState'='',
        
        #this is optional. If left blank, it brings all funds. Else, it brings given fund
        'ctl00$MainContent$HiddenFieldFundId'=fundId,
        
        #See DropDownListExtraFundType
        'ctl00$MainContent$DropDownListExtraFundType'=DropDownListExtraFundType[[1]],
        
        #see DropDownListFundTypeExplanation
        'ctl00$MainContent$DropDownListFundTypeExplanation'=DropDownListFundTypeExplanation[[1]],
        
        #start date for data, minimum date
        'ctl00$MainContent$TextBoxStartDate'=TextBoxStartDate,
        
        'ctl00$MainContent$TextBoxWatermarkExtenderStartDate_ClientState'='',
        
        #last date for data, maximum date
        'ctl00$MainContent$TextBoxEndDate'=TextBoxEndDate,
        
        'ctl00$MainContent$ButtonSearchDates'='Görüntüle',
        
        'ctl00$MainContent$ScriptManager1'='ctl00$MainContent$UpdatePanel1|ctl00$MainContent$ImageButtonGenelNext'
    ),
    encode='form',
    set_cookies('ASP.NET_SessionId'=session_cookies$value[1],
                'TS01ec1a88'=session_cookies$value[2],
                'TS392566ef027'=session_cookies$value[3]))

price_table2<-read_html(second_page) %>%
    html_nodes(xpath = '//*[@id="MainContent_GridViewGenel"]') %>%
    html_table() %>%
    as.data.frame() %>%
    mutate(
        across(
            where(is.character),
            function(x) { gsub(pattern = ",",replacement = ".",x = x) })
    )


price_table1$FonKodu==price_table2$FonKodu

【问题讨论】:

    标签: r web-scraping rvest httr


    【解决方案1】:

    我不知道我是否正确理解了这个问题,但解决方案可能是RSeleniumrvest

    下面是一个关于如何从 Genel Bilger 部分的表格中获取信息的简单示例。

    library(RSelenium)
    library(rvest)
    driver <- rsDriver(browser= 'firefox', port = 4532L)
    remote_driver <- driver[["client"]] 
    remote_driver$navigate("https://www.tefas.gov.tr/TarihselVeriler.aspx")
    
    empty_df<-list()
    for(i in 1:100){ #I don't know how many pages has the site
    #we check if there is the bottom next
    tryCatch(expr = { #this to manage the error
      html_page<-remote_driver$getPageSource() %>% unlist() %>% read_html() %>% html_table( fill=TRUE) #this to get the table
      next_bottom<-remote_driver$findElement(using = 'xpath',value ='//*[@id="MainContent_ImageButtonGenelNext"]')$clickElement() #this to go next
      Sys.sleep(3)
      empty_df[[i]] <-(html_page[[4]]) #this to save the table that we got
    
      },
      error = function(e){          # Specifying error message
               message("There was an error message.")}) #this to know when we have the error
    } 
    

    【讨论】:

    • 您好,谢谢您的回答!这有效,但实际上并不能解决我的问题。我希望刮板在没有任何浏览器的情况下运行(因此不使用 Rselenium)。那么有没有不使用 Rselenium 并实现 rvest 或 httr 的解决方案?
    • 我不相信没有 Selenium 可以切换页面。
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 2014-10-26
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多