【发布时间】:2021-06-18 07:53:45
【问题描述】:
我尝试制作一个 python 脚本,从 boxrec.com 获取所有战斗机名称和他们的记录。问题是它没有全部检索到它们(弗洛伊德·梅威瑟失踪),其中一些出现了多次(例如成功的 Tetteh)。
输出太大,无法在此处发布:https://cryptpad.fr/pad/#/2/pad/view/mYd4jIMOxY7QNUqW2-5TvYIvvx84KXbiMdYvXINGV9M/
编辑:对于一些拳手来说,记录是错误的(例如,瓦西尔·洛马琴科(Vasyl Lomachenko)似乎有 28 场胜利,但他有 14 场)
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time
def main():
fighter_names = []
fighter_wins = []
fighter_losses = []
fighter_draws = []
username = "username"
password = "password"
site = "https://boxrec.com/en/login"
payload = {
'_username': username,
'_password': password,
'login[go]': None
}
with Session() as s:
s.get(site)
s.post(site, data=payload, headers={
"Content-Type": "application/x-www-form-urlencoded"
})
pages = numpy.arange(1, 19152, 20)
for page in pages:
page = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(page.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
if not names_a:
print("solving captcha")
page = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(page.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
pyautogui.click(x=118, y=1061)
time.sleep(1)
pyautogui.click(x=1035, y=619)
time.sleep(2)
pyautogui.click(x=97, y=59)
time.sleep(1)
pyautogui.click(x=834, y=247)
time.sleep(2)
if not names_a:
print("please solve captcha manually")
while not names_a:
page = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(page.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
wins_span = soup.find_all('span', class_='textWon')
loses_span = soup.find_all('span', class_='textLost')
draws_span = soup.find_all('span', class_='textDraw')
for container in names_a:
name = container.text
print(name)
fighter_names.append(name)
for container in wins_span:
wins = container.text
fighter_wins.append(wins)
for container in loses_span:
losses = container.text
fighter_losses.append(losses)
for container in draws_span:
draws = container.text
fighter_draws.append(draws)
fighters = {
"name": fighter_names,
"wins": fighter_wins,
"loses": fighter_losses,
"draws": fighter_draws
}
df = pd.DataFrame.from_dict(fighters, orient="index")
df = df.transpose()
df.to_csv("fighters.csv")
if __name__ == '__main__':
main()
【问题讨论】:
标签: python-3.x pandas csv web-scraping beautifulsoup