【发布时间】:2021-03-13 20:58:16
【问题描述】:
我刚开始学习网络抓取,原以为用下面的脚本配合 Beautiful Soup 解析简单的雅虎财经数据已经取得了不错的进展。下面的脚本运行良好,但即使我请求了一整年的数据,它也只返回 100 行。我找到一些 SO 帖子,建议在请求中添加 data 参数,把 AJAX 和 mobile 设置为 no,但这也不起作用。我还尝试过传递不同的请求头(headers)信息,同样没有效果。Beautiful Soup 是否有某个我遗漏的参数,可以返回完整的列表?当我把请求得到的完整 HTML 内容打印出来时,完整的结果确实在里面,所以我很困惑。
from datetime import datetime, timedelta
import time
import requests
from bs4 import BeautifulSoup
def format_date_int_as_str(date_datetime):
    """Return *date_datetime* as a Unix timestamp (whole seconds) string.

    Args:
        date_datetime: A ``datetime.datetime``. A naive value is interpreted
            as local time; a timezone-aware value is converted using its own
            UTC offset.

    Returns:
        The number of seconds since the Unix epoch, as a decimal string.
    """
    # Drop sub-second precision first so the result is a whole number of
    # seconds regardless of the sign of the timestamp.
    whole_seconds = date_datetime.replace(microsecond=0)
    # datetime.timestamp() honours tzinfo, unlike timetuple() + time.mktime(),
    # which silently treats an aware datetime as if it were local time.
    return str(int(whole_seconds.timestamp()))
def subdomain(symbol, start, end, filter='history'):
    """Build the path-and-query part of a Yahoo Finance history URL.

    Args:
        symbol: Ticker symbol placed in the ``/quote/<symbol>/history`` path.
        start: Value for the ``period1`` query parameter.
        end: Value for the ``period2`` query parameter.
        filter: Value for the ``filter`` query parameter. The name shadows
            the builtin but is kept because callers may pass it by keyword.

    Returns:
        A string starting with ``/quote/`` that can be appended to the host.
        The interval and frequency are fixed at ``1d``.
    """
    # Function-scope import so this block does not depend on other edits.
    from urllib.parse import quote, urlencode

    # Percent-encode the symbol and the query values so that characters such
    # as '^' (index symbols) or '&' cannot corrupt the URL structure.
    query = urlencode([
        ("period1", start),
        ("period2", end),
        ("interval", "1d"),
        ("filter", filter),
        ("frequency", "1d"),
    ])
    return "/quote/{0}/history?{1}".format(quote(str(symbol), safe=""), query)
def header_function(subdomain):
    """Return the HTTP request headers used for a Yahoo Finance page fetch.

    Args:
        subdomain: The path-and-query string of the request; it is stored
            under the ``"path"`` key.

    Returns:
        A new dict mapping header names to string values.
    """
    # NOTE(review): "authority", "method", "path" and "scheme" look like the
    # HTTP/2 pseudo-headers shown by browser dev tools, minus the leading
    # colon. Here they are sent as ordinary headers; they are kept so the
    # request stays the same as before -- confirm whether they are needed.
    return {
        "authority": "finance.yahoo.com",
        "method": "GET",
        "path": subdomain,
        "scheme": "https",
        "accept": "text/html",
        # "br" is intentionally not advertised: the HTTP client can only
        # decode Brotli when an optional package is installed, and otherwise
        # the response body would come back still compressed.
        "accept-encoding": "gzip, deflate",
        "accept-language": "en-US,en;q=0.9",
        "cache-control": "no-cache",
        "dnt": "1",
        "pragma": "no-cache",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64)",
    }
def main():
    """Fetch the AAPL price-history page and print one line per table row."""
    symbol = 'AAPL'
    now = datetime.today()
    dt_start = now - timedelta(days=365)
    dt_end = now - timedelta(days=100)
    start = format_date_int_as_str(dt_start)
    end = format_date_int_as_str(dt_end)
    sub = subdomain(symbol, start, end)
    header = header_function(sub)
    url = "https://finance.yahoo.com" + sub
    print("\nREQUESTING: " + url + "\n" + str(dt_start) + " to " + str(dt_end))

    # Build HTML request and content for BS. A timeout stops the script from
    # hanging forever, and raise_for_status() surfaces HTTP errors instead of
    # silently parsing an error page.
    response = requests.get(url, headers=header, timeout=30)
    response.raise_for_status()

    # CSS class strings identifying the history table rows and cells.
    row_class = "BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)"
    date_cell_class = "Py(10px) Ta(start) Pend(10px)"
    value_cell_class = "Py(10px) Pstart(10px)"

    soup = BeautifulSoup(response.content, "html.parser")
    rows = soup.find_all("tr", {"class": row_class})
    print("LIST LENGTH: " + str(len(rows)))

    # Example URLs for reference:
    # https://finance.yahoo.com/quote/AAPL/history?period1=1572244075&period2=1615447675&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true
    # https://finance.yahoo.com/quote/AAPL/history?period1=1572244075&period2=1615447675&interval=1d&filter=history&frequency=1d
    # NOTE(review): open question -- why is the full look-back list not
    # returned? Presumably the extra rows are not present as <tr> elements
    # with this class in the served HTML; verify against the raw response.
    labels = ("OPEN", "HIGH", "LOW", "CLOSE", "ADJCLOSE", "VOLUME")
    for stock_day in rows:
        date_cell = stock_day.find("td", {"class": date_cell_class})
        if date_cell is None:
            # No date cell to report on; skip rather than crash on .text.
            continue
        value_cells = stock_day.find_all("td", {"class": value_cell_class})
        if len(value_cells) == len(labels):
            fields = " --|-- ".join(
                "{0}:{1}".format(label, cell.text)
                for label, cell in zip(labels, value_cells)
            )
            print(date_cell.text + " --|-- " + fields + " --|-- ")
        else:
            print(date_cell.text + " --|-- Skipping-- this is the dividend date!")


if __name__ == '__main__':
    main()
【问题讨论】:
标签: python web-scraping beautifulsoup yahoo-finance