【问题标题】:Script isn't retrieving all the info（脚本未检索到所有信息）
【发布时间】:2021-06-18 07:53:45
【问题描述】:

我尝试编写一个 Python 脚本，从 boxrec.com 获取所有拳手的姓名及其战绩。问题是它没有把所有拳手都检索出来（例如缺少弗洛伊德·梅威瑟），而且其中一些拳手出现了多次（例如 Success Tetteh）。

输出太大,无法在此处发布:https://cryptpad.fr/pad/#/2/pad/view/mYd4jIMOxY7QNUqW2-5TvYIvvx84KXbiMdYvXINGV9M/

编辑:对于一些拳手来说,记录是错误的(例如,瓦西尔·洛马琴科(Vasyl Lomachenko)似乎有 28 场胜利,但他有 14 场)

import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time


def main():
    """Log in to boxrec.com, walk the pro-boxer listing and write fighters.csv.

    For every listing offset the page is fetched and parsed with
    BeautifulSoup; the text of each ``a.personLink`` is collected as a name
    and the text of each ``span.textWon`` / ``span.textLost`` /
    ``span.textDraw`` as wins / losses / draws.  The four lists are then
    written out as the columns of ``fighters.csv``.

    NOTE(review): the four lists are filled independently of each other, so
    row ``i`` of the CSV is only a single fighter's record if every page
    yields exactly one span of each class per fighter link, in the same
    order -- nothing in this function checks that.
    """
    fighter_names = []
    fighter_wins = []
    fighter_losses = []
    fighter_draws = []
    username = "username"
    password = "password"
    site = "https://boxrec.com/en/login"
    payload = {
        '_username': username,
        '_password': password,
        'login[go]': None
    }
    with Session() as s:
        # Visit the login page first, then submit the credentials on the
        # same session so later requests reuse whatever cookies were set.
        s.get(site)
        s.post(site, data=payload, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        })

        # Listing offsets: 1, 21, 41, ... up to (but excluding) 19152.
        pages = numpy.arange(1, 19152, 20)
        for page in pages:
            # NOTE(review): `page` is rebound from the numeric offset to the
            # Response object here.  Every retry request further down builds
            # its URL with str(page), which at that point is the string form
            # of the Response, not the offset.
            # NOTE(review): the URL literal ends with "offset= " -- the
            # trailing space is part of the query string that is sent.
            page = s.get(
                "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                + str(page))
            soup = BeautifulSoup(page.text, 'html.parser')
            names_a = soup.find_all('a', class_='personLink')
            if not names_a:
                # No fighter links on the page is treated as "a captcha was
                # served instead of the listing".
                print("solving captcha")
                page = s.get(
                    "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                    "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                    + str(page))
                soup = BeautifulSoup(page.text, 'html.parser')
                names_a = soup.find_all('a', class_='personLink')
                # Scripted mouse clicks at fixed screen coordinates
                # (presumably driving a browser window on the author's
                # desktop to clear the captcha -- TODO confirm).  They run
                # after the re-fetch above, so `names_a` does not yet
                # reflect their effect.
                pyautogui.click(x=118, y=1061)
                time.sleep(1)
                pyautogui.click(x=1035, y=619)
                time.sleep(2)
                pyautogui.click(x=97, y=59)
                time.sleep(1)
                pyautogui.click(x=834, y=247)
                time.sleep(2)
                if not names_a:
                    print("please solve captcha manually")
                # Re-request back to back, with no delay, until a page with
                # fighter links comes back.
                while not names_a:
                    page = s.get(
                        "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                        "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                        + str(page))
                    soup = BeautifulSoup(page.text, 'html.parser')
                    names_a = soup.find_all('a', class_='personLink')
            # Page-wide searches: each list holds every matching span on the
            # page, with no link back to the fighter row it came from.
            wins_span = soup.find_all('span', class_='textWon')
            loses_span = soup.find_all('span', class_='textLost')
            draws_span = soup.find_all('span', class_='textDraw')
            for container in names_a:
                name = container.text
                print(name)
                fighter_names.append(name)

            for container in wins_span:
                wins = container.text
                fighter_wins.append(wins)

            for container in loses_span:
                losses = container.text
                fighter_losses.append(losses)

            for container in draws_span:
                draws = container.text
                fighter_draws.append(draws)

    fighters = {
        "name": fighter_names,
        "wins": fighter_wins,
        "loses": fighter_losses,
        "draws": fighter_draws
    }
    # Build one row per list and transpose so each list becomes a column;
    # presumably chosen because it tolerates lists of different lengths
    # (shorter ones padded with missing values) -- TODO confirm.
    df = pd.DataFrame.from_dict(fighters, orient="index")
    df = df.transpose()
    df.to_csv("fighters.csv")


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

【问题讨论】:

    标签: python-3.x pandas csv web-scraping beautifulsoup


    【解决方案1】:

    我会避免使用相同的变量名称来表示 2 个不同的事物...看起来您在 2 个不同的实例中使用了 page 变量,这可能会造成混淆。

    至于数据出错的问题，我猜是各个列表在某个位置出现了错位，导致后面的数据没有与正确的拳手姓名对应上，或者是网站的实际数据/HTML 本身有问题。我不能完全确定，因为我还没有调试过。因此我想问：您是否考虑过使用 pandas 解析表格，然后拆分 'w-l-d' 列？我认为让 pandas 来解析会容易得多，也不容易在您需要遍历的 900 多个页面中漏掉某些内容。

    看看这是否有帮助:

    import numpy
    from requests import Session
    from bs4 import BeautifulSoup
    import pandas as pd
    import pyautogui
    import time
    import math
    
    
    def _has_fighters(html):
        """Return True when the listing HTML contains at least one fighter link."""
        soup = BeautifulSoup(html, 'html.parser')
        return soup.find('a', class_='personLink') is not None


    def _fetch_listing(session, url, poll_seconds=5):
        """Fetch one listing page, waiting out a captcha if one is served.

        A page without any ``a.personLink`` element is treated as a captcha
        page.  In that case the scripted clicks are replayed once and the page
        is requested again; if that is still not enough, the URL is polled
        every ``poll_seconds`` seconds until the listing comes back (the user
        is expected to solve the captcha by hand in the meantime).

        Returns the HTML text of the page that contains the fighter links.
        """
        html = session.get(url).text
        if _has_fighters(html):
            return html

        print("solving captcha")
        # Fixed screen coordinates taken from the original script; they are
        # presumably specific to the author's desktop layout -- adjust them
        # for your own screen.
        pyautogui.click(x=118, y=1061)
        time.sleep(1)
        pyautogui.click(x=1035, y=619)
        time.sleep(2)
        pyautogui.click(x=97, y=59)
        time.sleep(1)
        pyautogui.click(x=834, y=247)
        time.sleep(2)

        # Re-fetch only after the clicks, so their effect is actually observed.
        html = session.get(url).text
        if not _has_fighters(html):
            print("please solve captcha manually")
        while not _has_fighters(html):
            # Pause between attempts instead of hammering the server.
            time.sleep(poll_seconds)
            html = session.get(url).text
        return html


    def _parse_records(html):
        """Return a DataFrame with columns name / wins / loses / draws.

        The last table on the page is parsed with pandas.  Only rows whose
        'w-l-d' cell is exactly three whitespace-separated integers are kept;
        any other rows that come with the table are dropped.
        """
        from io import StringIO

        # Passing literal HTML to read_html is deprecated; wrap it in StringIO.
        table = pd.read_html(StringIO(html))[-1]
        # extract() filters and splits in one step and always yields the three
        # named columns, even when no row matches.
        records = table['w-l-d'].astype(str).str.extract(
            r"^(?P<wins>\d+)\s+(?P<loses>\d+)\s+(?P<draws>\d+)$")
        parsed = pd.concat([table[['name']], records], axis=1)
        return parsed.dropna(subset=['wins'])


    def main():
        """Log in to boxrec.com, scrape every pro-boxer listing page and
        write the combined name / wins / loses / draws table to fighters.csv.
        """
        username = 'username'
        password = 'password'
        site = "https://boxrec.com/en/login"
        payload = {
            '_username': username,
            '_password': password,
            'login[go]': None
        }
        # No trailing space after "offset=": the offset is appended directly.
        listing_url = (
            "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
            "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset="
        )
        # Same offsets as before: 1, 21, 41, ... below 19152.
        offsets = range(1, 19152, 20)
        frames = []

        with Session() as s:
            s.get(site)
            s.post(site, data=payload, headers={
                "Content-Type": "application/x-www-form-urlencoded"
            })

            for page_number, offset in enumerate(offsets, start=1):
                html = _fetch_listing(s, listing_url + str(offset))
                frames.append(_parse_records(html))
                print('Page: %d of %d' % (page_number, len(offsets)))

        # DataFrame.append was removed in pandas 2.0 and copied the whole frame
        # on every page; concatenate once at the end instead.
        final_df = pd.concat(frames, sort=False).reset_index(drop=True)
        final_df.to_csv("fighters.csv")


    if __name__ == '__main__':
        main()
    

    【讨论】:

    • 成功了。但结果中仍然缺少弗洛伊德·梅威瑟。我检查了一下，缺失最多的似乎是已退役的拳手，我想我应该使用 boxrec 的另一个链接：boxrec.com/en/… 这是代码：bpa.st/LWMA 但它会引发此错误：bpa.st/ISYQ
    • 好的。稍后我会看看它,看看发生了什么。
    • 我编辑了代码。您需要添加 1 行来删除表格附带的任何非 w-l-d 记录。请参阅上面的编辑,我在其中添加了 1 行 df = df[df['w-l-d'].astype(str).str.match(r"(^\d*.\d*.\d*$)")]
    猜你喜欢
    • 2014-08-09
    • 1970-01-01
    • 2017-04-08
    • 2016-01-15
    • 1970-01-01
    • 2015-01-27
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多