【发布时间】:2021-01-31 19:52:02
【问题描述】:
我正在尝试从 https://www.tefas.gov.tr/TarihselVeriler.aspx 中抓取土耳其基金市场数据。但是,数据不驻留在 HTML 标记中,所以我发布了一个带有表单的请求,以获取页面上的数据。
我可以通过正确的参数从第一页成功获取数据,但是,我无法遍历页面并从下一页获取数据,即使添加了
'ctl00$MainContent$ScriptManager1'='ctl00$MainContent$UpdatePanel1|ctl00$MainContent$ImageButtonGenelNext'
到第二个发帖请求表。我不确定根本原因,但我怀疑每次我发送一个发布请求时,它不是从同一个会话发送请求,而是将请求视为来自新会话。因此,为了解决这个问题,我使用来自的数据为发布请求设置了 cookie
tefas_session 但还是不行。您可以通过price_table1$FonKodu==price_table2$FonKodu 检查两个post 请求是否返回相同的数据。我原以为 price_table1 会从第一页带来数据,而 price_table2 会从第二页带来数据。
这是我目前写的代码:
library(rvest)
library(dplyr)
library(httr)
url<-"https://www.tefas.gov.tr/TarihselVeriler.aspx"
tefas_session<-html_session(
url,
httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36")
)
tefas_form<-tefas_session %>%
html_form()
fields<-tefas_form[[1]]$fields
#Arguments
fund_type<-'YAT' #Optional, default is 'YAT' which brings investment funds
fundId<-'' #Optional, default is empty string
TextBoxStartDate<-"15.01.2021"
TextBoxEndDate<-"30.01.2021"
first_page<-rvest:::request_POST(
x = tefas_session,
url = url,
httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"),
body=list(
#these 4 values contain session data
'__VIEWSTATE'=fields$'__VIEWSTATE'$value,
'__VIEWSTATEGENERATOR'=fields$'__VIEWSTATEGENERATOR'$value,
'__VIEWSTATEENCRYPTED'=fields$'__VIEWSTATEENCRYPTED'$value,
'__EVENTVALIDATION'=fields$'__EVENTVALIDATION'$value,
#this fields selects the type of a fund.
#'YAT' fetches investment funds, 'EMK' fetches retirement funds
'ctl00$MainContent$RadioButtonListFundMainType'=fund_type,
#These are supposed to be empty
'ctl00$MainContent$TextBoxOtherFund'='',
'ctl00$MainContent$TextBoxWatermarkExtenderFund_ClientState'='',
#this is optional. If left blank, it brings all funds. Else, it brings given fund
'ctl00$MainContent$HiddenFieldFundId'=fundId,
#See DropDownListExtraFundType
'ctl00$MainContent$DropDownListExtraFundType'=DropDownListExtraFundType[[1]],
#see DropDownListFundTypeExplanation
'ctl00$MainContent$DropDownListFundTypeExplanation'=DropDownListFundTypeExplanation[[1]],
#start date for data, minimum date
'ctl00$MainContent$TextBoxStartDate'=TextBoxStartDate,
'ctl00$MainContent$TextBoxWatermarkExtenderStartDate_ClientState'='',
#last date for data, maximum date
'ctl00$MainContent$TextBoxEndDate'=TextBoxEndDate,
'ctl00$MainContent$ButtonSearchDates'='Görüntüle',
'ctl00$MainContent$ScriptManager1'='ctl00$MainContent$UpdatePanel1|ctl00$MainContent$ButtonSearchDates'
),
encode="form",
set_cookies('ASP.NET_SessionId'=session_cookies$value[1],
'TS01ec1a88'=session_cookies$value[2],
'TS392566ef027'=session_cookies$value[3]))
price_table1<-read_html(first_page) %>%
html_nodes(xpath = '//*[@id="MainContent_GridViewGenel"]') %>%
html_table() %>%
as.data.frame() %>%
mutate(
across(
where(is.character),
function(x) { gsub(pattern = ",",replacement = ".",x = x) })
)
#To-Do
# Find a way to traverse through pages.
next_page<-'ctl00$MainContent$UpdatePanel1|ctl00$MainContent$ImageButtonGenelNext'
second_page<-rvest:::request_POST(
x = tefas_session,
url = url,
httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"),
body=list(
#TO-DO: ENTER ALL FORM VALUES
#these 4 values are used for cookies
'__VIEWSTATE'=fields$'__VIEWSTATE'$value,
'__VIEWSTATEGENERATOR'=fields$'__VIEWSTATEGENERATOR'$value,
'__VIEWSTATEENCRYPTED'=fields$'__VIEWSTATEENCRYPTED'$value,
'__EVENTVALIDATION'=fields$'__EVENTVALIDATION'$value,
'__EVENTTARGET'=next_page,
#this fields selects the type of a fund.
#'YAT' fetches investment funds, 'EMK' fetches retirement funds
'ctl00$MainContent$RadioButtonListFundMainType'=fund_type,
#These are supposed to be empty
'ctl00$MainContent$TextBoxOtherFund'='',
'ctl00$MainContent$TextBoxWatermarkExtenderFund_ClientState'='',
#this is optional. If left blank, it brings all funds. Else, it brings given fund
'ctl00$MainContent$HiddenFieldFundId'=fundId,
#See DropDownListExtraFundType
'ctl00$MainContent$DropDownListExtraFundType'=DropDownListExtraFundType[[1]],
#see DropDownListFundTypeExplanation
'ctl00$MainContent$DropDownListFundTypeExplanation'=DropDownListFundTypeExplanation[[1]],
#start date for data, minimum date
'ctl00$MainContent$TextBoxStartDate'=TextBoxStartDate,
'ctl00$MainContent$TextBoxWatermarkExtenderStartDate_ClientState'='',
#last date for data, maximum date
'ctl00$MainContent$TextBoxEndDate'=TextBoxEndDate,
'ctl00$MainContent$ButtonSearchDates'='Görüntüle',
'ctl00$MainContent$ScriptManager1'='ctl00$MainContent$UpdatePanel1|ctl00$MainContent$ImageButtonGenelNext'
),
encode='form',
set_cookies('ASP.NET_SessionId'=session_cookies$value[1],
'TS01ec1a88'=session_cookies$value[2],
'TS392566ef027'=session_cookies$value[3]))
price_table2<-read_html(second_page) %>%
html_nodes(xpath = '//*[@id="MainContent_GridViewGenel"]') %>%
html_table() %>%
as.data.frame() %>%
mutate(
across(
where(is.character),
function(x) { gsub(pattern = ",",replacement = ".",x = x) })
)
price_table1$FonKodu==price_table2$FonKodu
【问题讨论】:
标签: r web-scraping rvest httr