【问题标题】:selenium web browser error for multiple urls(处理多个网址时 Selenium 浏览器报错)
【发布时间】:2021-06-24 16:51:02
【问题描述】:

此代码对单个 URL 可以正常运行,但传入多个 URL 时就会报错:

import os
import pandas as pd
from selenium import webdriver
from tabulate import tabulate
from datetime import datetime
import time
from bs4 import BeautifulSoup as bs

# Record the start time so the total run time can be printed at the end.
start = datetime.now()
# One Chrome WebDriver session, shared by every parse_data() call below.
browser = webdriver.Chrome()

class GameData:
    """Column-oriented container for scraped rows: one list per output column."""

    def __init__(self):
        # Attribute order matters: the instance __dict__ is later handed to
        # pandas.DataFrame, so it determines the column order of the result.
        for column in (
            "date",
            "time",
            "game",
            "score",
            "home_odds",
            "draw_odds",
            "away_odds",
        ):
            setattr(self, column, [])


def parse_data(url):
    """Load ``url`` in the shared browser and collect its results table.

    Returns a ``(game_data, country, league)`` tuple: a ``GameData`` with one
    list entry per match row, plus the text of the second and third links
    found in the table header cell whose class is ``'first2 tl'``.
    """
    browser.get(url)
    # First table that pandas finds in the rendered page; row 0 is the header.
    df = pd.read_html(browser.page_source, header=0)[0]
    html = browser.page_source
    soup = bs(html,"lxml")
    cont = soup.find('div', {'id':'wrap'})
    conti = cont.find('div', {'id':'col-content'})
    # NOTE(review): the second dict is passed positionally after `attrs`; in
    # BeautifulSoup that slot appears to be `recursive`, so the id is probably
    # not used as a filter here -- confirm against the find() documentation.
    content = conti.find('table', {'class':'table-main'}, {'id':'tournamentTable'})
    main = content.find('th', {'class':'first2 tl'})
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        # row[0] is the index; row[1] is the first table column.
        if not isinstance(row[1], str):
            # Non-string first cell (e.g. NaN): nothing to record.
            continue
        elif ':' not in row[1]:
            # No time in the first cell: remember the text before the first
            # '-' as the date for the match rows that follow.
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
    # NOTE(review): this ends the module-level WebDriver session after the
    # first URL, so the next parse_data() call has no browser to talk to.
    # That matches the connection-refused MaxRetryError reported for
    # multiple URLs; the quit belongs after the loop over all URLs.
    browser.quit()
    return game_data, country, league

# You can input as many URLs you want.
# Kept as a list (not a set) so pages are visited in the order written here.
urls = [
    "https://www.oddsportal.com/soccer/europe/champions-league/results/",
    "https://www.oddsportal.com/soccer/australia/a-league/results/#/page/1/",
    "https://www.oddsportal.com/soccer/belgium/jupiler-league/results/#/page/1/",
    "https://www.oddsportal.com/soccer/czech-republic/1-liga/results/#/page/1/",
]

if __name__ == '__main__':

    # Collect one frame per URL and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0.
    frames = []

    for url in urls:
        game_data, country, competition = parse_data(url)
        result = pd.DataFrame(game_data.__dict__)
        result['country'] = country
        result['competition'] = competition
        frames.append(result)

    # An empty URL list yields an empty frame instead of crashing on None.
    results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Reporting stays inside the guard: `results` only exists when this file
    # is run as a script, so importing it must not try to print anything.
    print(tabulate(results.head(), headers='keys', tablefmt="github"))
    end = datetime.now()
    time_taken = end - start
    print('Time taken to complete: ', time_taken)

浏览器没有继续访问下一个网址,而是直接关闭了,并抛出如下错误:

raise MaxRetryError(_pool, url, error or ResponseError(cause)) urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=54008): Max retries exceeded with url: /session/6c9f4ce81beb95e93f2cc32858dfb114/url (Caused by NewConnectionError(': Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

去掉 country 和 league(即不使用 return game_data, country, league)的版本可以完美运行,代码如下:

from selenium import webdriver
import pandas as pd

# One Chrome WebDriver session, shared by every parse_data() call below.
browser = webdriver.Chrome()

class GameData:
    """Column-oriented container for scraped rows: one list per output column."""

    def __init__(self):
        # Each column gets its own fresh list; key order is the column order
        # pandas.DataFrame will see when it is given this instance's __dict__.
        self.__dict__.update({
            name: []
            for name in (
                "dates",
                "games",
                "scores",
                "home_odds",
                "draw_odds",
                "away_odds",
            )
        })


def parse_data(url):
    """Load ``url`` in the shared browser and return its rows as ``GameData``.

    The first table on the rendered page is parsed with pandas. Rows whose
    first cell is not a string are skipped; rows whose first cell contains no
    ``':'`` are treated as date headers and set the date for the match rows
    that follow. Every other row is recorded as one match.
    """
    # Local import keeps this function self-contained.
    from io import StringIO

    browser.get(url)
    # Newer pandas deprecates passing literal HTML to read_html; a file-like
    # wrapper is accepted by old and new versions alike.
    df = pd.read_html(StringIO(browser.page_source), header=0)[0]
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        # row[0] is the index; row[1] is the first table column.
        first_cell = row[1]
        if not isinstance(first_cell, str):
            continue
        if ':' not in first_cell:
            # Date header row: keep the text before the first '-'.
            game_date = first_cell.split('-')[0]
            continue
        game_data.dates.append(game_date)
        game_data.games.append(row[2])
        game_data.scores.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])

    return game_data


# Kept as a list (not a set) so pages are visited in the order written here.
urls = [
    "https://www.oddsportal.com/soccer/australia/a-league/results/",
    "https://www.oddsportal.com/soccer/europe/champions-league/results/",
    "https://www.oddsportal.com/soccer/europe/europa-league/results/",
]

if __name__ == '__main__':

    # Collect one frame per URL and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0.
    frames = []

    try:
        for url in urls:
            game_data = parse_data(url)
            frames.append(pd.DataFrame(game_data.__dict__))
    finally:
        # Shut the driver down only after every URL has been visited, and
        # do it even if one of the pages fails to parse.
        browser.quit()

    # Stays None when there was nothing to scrape, as before.
    results = pd.concat(frames, ignore_index=True) if frames else None

如何让它对多个 URL 循环执行?

【问题讨论】:

  • 这似乎是一个网络爬虫/抓取脚本。您的错误 urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=54008): Max retries exceeded with url... 表示目标机器(也就是为您提供网页的服务器)拒绝了您的连接;这可能是一种反爬虫措施
  • @Kristian 当我尝试加入 competition 和 country 时,代码才开始出现错误
  • 正如@Kristian 所说,该网站可能正在积极尝试阻止您尝试做的事情。毕竟,它是受版权保护的材料。
  • 会不会是dataframe调用/构造不正确?

标签: python selenium-webdriver


【解决方案1】:

问题在于我没有正确地定义和构建数据框;另外,关闭浏览器的调用(代码中的 browser.quit())发生在循环结束之前,因此浏览器在访问下一个 URL 之前就已经关闭了。

import os
import pandas as pd
from selenium import webdriver
from tabulate import tabulate
from datetime import datetime
import time

from bs4 import BeautifulSoup as bs

# One Chrome WebDriver session, shared by every parse_data() call below.
browser = webdriver.Chrome()


class GameData:
    """Column-oriented container for scraped rows: one list per output column."""

    def __init__(self):
        column_names = (
            "date",
            "time",
            "game",
            "score",
            "home_odds",
            "draw_odds",
            "away_odds",
            "country",
            "league",
        )
        # A fresh list per column, inserted in the order above so that
        # pandas.DataFrame(instance.__dict__) gets its columns in that order.
        vars(self).update((name, []) for name in column_names)


def parse_data(url):
    """Load ``url`` in the shared browser and return its rows as ``GameData``.

    The first table on the rendered page is parsed with pandas. The country
    and league are read from the second and third links in the header cell
    of the ``tournamentTable`` table and repeated on every recorded row.

    Rows whose first cell is not a string are skipped; rows whose first cell
    contains no ``':'`` are treated as date headers and set the date for the
    match rows that follow.

    The browser is deliberately left open: the same session is reused for
    every URL, so it must only be closed by the caller after the last page.
    """
    # Local import keeps this function self-contained.
    from io import StringIO

    browser.get(url)
    # Take a single snapshot of the rendered page so the table rows and the
    # country/league header are parsed from exactly the same HTML.
    html = browser.page_source
    # Newer pandas deprecates passing literal HTML to read_html; a file-like
    # wrapper is accepted by old and new versions alike.
    df = pd.read_html(StringIO(html), header=0)[0]

    soup = bs(html, "lxml")
    wrapper = soup.find('div', {'id': 'wrap'})
    column = wrapper.find('div', {'id': 'col-content'})
    # Both filters belong in one attrs dict: a second positional dict lands
    # in find()'s `recursive` slot and the id would be silently ignored.
    table = column.find('table', {'class': 'table-main', 'id': 'tournamentTable'})
    header = table.find('th', {'class': 'first2 tl'})
    links = header.find_all('a')
    country = links[1].text
    league = links[2].text

    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        # row[0] is the index; row[1] is the first table column.
        first_cell = row[1]
        if not isinstance(first_cell, str):
            continue
        if ':' not in first_cell:
            # Date header row: keep the text before the first '-'.
            game_date = first_cell.split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(first_cell)
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data


# Kept as a list (not a set) so pages are visited in the order written here.
urls = [
    "https://www.oddsportal.com/soccer/europe/champions-league/results/",
    "https://www.oddsportal.com/soccer/australia/a-league/results/#/page/1/",
    "https://www.oddsportal.com/soccer/belgium/jupiler-league/results/#/page/1/",
    "https://www.oddsportal.com/soccer/czech-republic/1-liga/results/#/page/1/",
]

if __name__ == '__main__':

    # Collect one frame per URL and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0.
    frames = []

    try:
        for url in urls:
            game_data = parse_data(url)
            frames.append(pd.DataFrame(game_data.__dict__))
    finally:
        # Shut the driver down only after every URL has been visited, and
        # do it even if one of the pages fails to parse.
        browser.quit()

    # An empty URL list yields an empty frame instead of passing None on.
    results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Printing stays inside the guard: `results` only exists when this file
    # is run as a script, so importing it must not try to print anything.
    print(tabulate(results, headers='keys', tablefmt="github"))

【讨论】:

    猜你喜欢
    • 2014-09-23
    • 1970-01-01
    • 2012-10-30
    • 2019-06-19
    • 1970-01-01
    • 1970-01-01
    • 2013-03-19
    • 1970-01-01
    • 2017-04-16
    相关资源
    最近更新 更多