【发布时间】:2021-06-24 16:51:02
【问题描述】:
此代码适用于单个 url,但有多个,我得到错误
import os
import pandas as pd
from selenium import webdriver
from tabulate import tabulate
from datetime import datetime
import time
from bs4 import BeautifulSoup as bs
start = datetime.now()
browser = webdriver.Chrome()
class GameData:
def __init__(self):
self.date = []
self.time = []
self.game = []
self.score = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
def parse_data(url):
browser.get(url)
df = pd.read_html(browser.page_source, header=0)[0]
html = browser.page_source
soup = bs(html,"lxml")
cont = soup.find('div', {'id':'wrap'})
conti = cont.find('div', {'id':'col-content'})
content = conti.find('table', {'class':'table-main'}, {'id':'tournamentTable'})
main = content.find('th', {'class':'first2 tl'})
count = main.findAll('a')
country = count[1].text
league = count[2].text
game_data = GameData()
game_date = None
for row in df.itertuples():
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
game_date = row[1].split('-')[0]
continue
game_data.date.append(game_date)
game_data.time.append(row[1])
game_data.game.append(row[2])
game_data.score.append(row[3])
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
browser.quit()
return game_data, country, league
# You can input as many URLs you want
urls = {
"https://www.oddsportal.com/soccer/europe/champions-league/results/",
"https://www.oddsportal.com/soccer/australia/a-league/results/#/page/1/",
"https://www.oddsportal.com/soccer/belgium/jupiler-league/results/#/page/1/",
"https://www.oddsportal.com/soccer/czech-republic/1-liga/results/#/page/1/",
}
if __name__ == '__main__':
results = None
for url in urls:
game_data, country, competition = parse_data(url)
result = pd.DataFrame(game_data.__dict__)
result['country'] = country
result['competition'] = competition
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
print(tabulate(results.head(), headers='keys', tablefmt="github"))
end = datetime.now()
time_taken = end - start
print('Time taken to complete: ', time_taken)
浏览器不循环到下一个网址并关闭
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=54008): 最大重试次数超出了 url: /session/6c9f4ce81beb95e93f2cc32858dfb114/url (由 NewConnectionError('
上面没有return game_data, country, league的代码可以完美运行
from selenium import webdriver
import pandas as pd
browser = webdriver.Chrome()
class GameData:
def __init__(self):
self.dates = []
self.games = []
self.scores = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
def parse_data(url):
browser.get(url)
df = pd.read_html(browser.page_source, header=0)[0]
game_data = GameData()
game_date = None
for row in df.itertuples():
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
game_date = row[1].split('-')[0]
continue
game_data.dates.append(game_date)
game_data.games.append(row[2])
game_data.scores.append(row[3])
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
return game_data
urls = {"https://www.oddsportal.com/soccer/australia/a-league/results/",
"https://www.oddsportal.com/soccer/europe/champions-league/results/",
"https://www.oddsportal.com/soccer/europe/europa-league/results/"}
if __name__ == '__main__':
results = None
for url in urls:
game_data = parse_data(url)
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
如何为多个 url 循环它?
【问题讨论】:
-
这似乎是网络抓取/抓取脚本。您的错误
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=54008): Max retries exceeded with url...表示目标机器(也就是为您提供网页的服务器)拒绝您的连接;可能作为反刮/爬措施 -
@Kristian 当我尝试更新
competition和country时,代码开始出现错误 -
正如@Kristian 所说,该网站可能正在积极尝试阻止您尝试做的事情。毕竟,它是受版权保护的材料。
-
会不会是dataframe调用/构造不正确?