有两点让这个网站很难单独使用 beautifulsoup 来抓取:
- 数据不是 html 中的纯文本。它位于 ID 为 tournament-page-data-results 的 div 中,如下所示:
SA÷1¬~ZA÷ENGLAND: Premier League¬ZEE÷dYlOSQOD¬ZB÷198¬ZY÷England¬ZC÷zTRyeuJg¬ZD÷t¬ZE÷AJuiuwWt¬ZF÷0¬ZO÷0¬ZG÷1¬ZH÷198_dYlOSQOD¬ZJ÷2¬ZL÷/en/soccer/england/premier-league/¬ZX÷00England 007ngland0000000000001000Premier Leag014League000¬ZCC÷0¬ZAF÷England¬~AA÷6J0L2p0r¬AD÷1601835300¬ADE÷1601835300¬AB÷3¬CR÷3¬AC÷3¬CX÷Aston Villa¬ER÷Round 4¬RW÷0¬AX÷1¬AO÷1601842143¬BX÷-1¬HMC÷1¬WQ÷¬WN÷LIV¬AF÷Liverpool¬JB÷Yi2C1SGu¬WV÷liverpool¬AH÷2¬BB÷1¬BD÷1¬WM÷AST¬AE÷Aston Villa¬JA÷f3v9dzKU¬WU÷aston-villa¬AS÷1¬AZ÷1¬AG÷7¬BA÷4¬BC÷3¬AW÷1¬~AA÷l2dtbMED¬AD÷1601825400¬ADE÷1601825400¬AB÷3¬CR÷3¬AC÷3¬CX÷Manchester Utd¬ER÷Round 4¬RW÷0¬AX÷1¬AO÷1601832194¬BX÷-1¬HMC÷1¬WQ÷¬WN÷TOT¬AF÷Tottenham¬JB÷IHkhE50o¬WV÷tottenham¬AS÷2¬AZ÷2¬AH÷6¬BB÷4¬BD÷2¬WM÷MNU¬AE÷Manchester Utd¬JA÷U1dAkMNp¬WU÷manchester-united¬AJ÷1¬AG÷1¬BA÷1¬BC÷0¬AW÷1¬~AA÷0xOh7QiR¬AD÷1601816400¬ADE÷1601816400¬AB÷3¬CR÷3¬AC÷3¬CX÷Arsenal¬ER÷Round 4¬RW÷0¬AX÷1¬AO÷1601823089¬BX÷-1¬HMC÷1¬WQ÷¬WM÷ARS¬AE÷Arsenal¬JA÷MyR1bdkI¬WU÷arsenal¬AS÷1¬AZ÷1¬AG÷2¬BA÷0¬BC÷2¬WN÷SHU¬AF÷Sheffield Utd¬JB÷GCu5cG4O¬WV÷sheffield-utd¬AH÷1¬BB÷0¬BD÷1¬AW÷1¬~AA÷rFhWKqMQ¬AD÷1601816400¬ADE÷1601816400¬AB÷3¬CR÷3¬AC÷3¬C
这是一种自定义格式,使用 ~ 作为行分隔符,¬ 作为单元格分隔符,÷ 用于分隔键/值。可以通过查看 js(文件名以 core_ 开头的 .js 文件)来推断其逻辑。为了在 python 中解析它,我们需要重现该逻辑,并为所有标签添加对应关系(例如 ZA 对应 SHAREDINDEXES_TOURNAMENT_NAME 等)
- 第二点是,要访问统计页面,需要一个令牌(token),它以 JS 形式硬编码在同一个 core_*.js 文件中。
这需要用正则表达式从 JS 中提取令牌,这也说明在这种情况下使用 selenium 可能更合适。
以下代码提取比赛数据,解析自定义格式,获取 js 文件,从中提取 token,生成统计数据的 url 并获取统计数据的 html:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
# Separators of the custom text format embedded in the results page.
DELIMITER_ROW = "~"  # separates rows
DELIMITER_CELL = "¬"  # separates the cells of one row
DELIMITER_VALUE = "÷"  # separates a cell's key from its value

# Descriptive label -> short key as it appears in the feed.
# NOTE(review): some labels look unrelated to the values seen in the sample
# feed (e.g. "ER" carries "Round 4", "AE" a team name); they are kept as-is
# because the parsing code uses the labels as its output keys.
indexes = {
    # match rows
    "COMMONINDEXES_AWAY_FIRST_OUTS": "ER",
    "COMMONINDEXES_FT_WINNER": "AZ",
    "COMMONINDEXES_ROW": "RW",
    "FSCORE_DRAWINDEXES_ROUND_ADVANCING_PARTICIPANT": "AE",
    "FULLFEEDINDEXES_AWAY_3CHAR_NAME": "WN",
    "FULLFEEDINDEXES_AWAY_EVENT_PARTICIPANT_ID": "JB",
    "FULLFEEDINDEXES_AWAY_PARTICIPANT_NAME": "AF",
    "FULLFEEDINDEXES_AWAY_PARTICIPANT_NAME_URL": "WV",
    "FULLFEEDINDEXES_EVENT_START_UTIME": "ADE",
    "FULLFEEDINDEXES_HAS_MATCH_COMMENTS": "HMC",
    "FULLFEEDINDEXES_HOME_3CHAR_NAME": "WM",
    "FULLFEEDINDEXES_HOME_EVENT_PARTICIPANT_ID": "JA",
    "FULLFEEDINDEXES_HOME_PARTICIPANT_NAME_URL": "WU",
    "FULLFEEDINDEXES_SORT_PARTICIPANT": "CX",
    "FULLFEEDINDEXES_WINNER": "AS",
    "SHAREDINDEXES_AWAY_CURRENT_RESULT": "AH",
    "SHAREDINDEXES_AWAY_RESULT_PERIOD_1": "BB",
    "SHAREDINDEXES_AWAY_RESULT_PERIOD_2": "BD",
    "SHAREDINDEXES_CRICKET_RECENT_OVERS": "WQ",
    "SHAREDINDEXES_EVENT_ID": "AA",
    "SHAREDINDEXES_EVENT_STAGE_ID": "AC",
    "SHAREDINDEXES_EVENT_STAGE_TYPE_FROM_EVENT_STAGE_ID": "CR",
    "SHAREDINDEXES_EVENT_STAGE_TYPE_ID": "AB",
    "SHAREDINDEXES_GAME_TIME": "BX",
    "SHAREDINDEXES_HAS_LINEUPS": "AX",
    "SHAREDINDEXES_HAS_LIVE_CENTRE": "AW",
    "SHAREDINDEXES_HOME_CURRENT_RESULT": "AG",
    "SHAREDINDEXES_HOME_RESULT_PERIOD_1": "BA",
    "SHAREDINDEXES_HOME_RESULT_PERIOD_2": "BC",
    "SHAREDINDEXES_MATCH_START_UTIME": "AD",
    "SHAREDINDEXES_PERIOD_START_UTIME": "AO",
    "SHAREDINDEXES_SPORT_ID": "SA",
    # tournament / league header row
    "SHAREDINDEXES_TOURNAMENT_NAME": "ZA",
    "LEAGUEINDEXES_COUNTRY_ID": "ZB",
    "FULLFEEDINDEXES_TOURNAMENT_TEMPLATE_ID": "ZEE",
    "LEAGUEINDEXES_COUNTRY_NAME": "ZY",
    "SHAREDINDEXES_TOURNAMENT_STAGE_ID": "ZC",
    "LEAGUEINDEXES_TOURNAMENT_TYPE": "ZD",
    "LEAGUEINDEXES_TOURNAMENT_ID": "ZE",
    "LEAGUEINDEXES_SOURCE_TYPE": "ZF",
    "UPDATEINDEXES_HAS_LIVE_TABLE": "ZO",
    "LEAGUEINDEXES_STATS_TYPE": "ZG",
    "LEAGUEINDEXES_TOURNAMENT_TEMPLATE_KEY": "ZH",
    "LEAGUEINDEXES_TOURNAMENT_STAGE_TYPE": "ZJ",
    "LEAGUEINDEXES_TOURNAMENT_TEMPLATE_URL": "ZL",
    "LEAGUEINDEXES_SORT_KEY": "ZX",
    "LEAGUEINDEXES_STAGES_COUNT": "ZCC",
    "FULLFEEDINDEXES_CATEGORY_CAPTION": "ZAF",
    # red cards
    "SHAREDINDEXES_HOME_RED_CARD_COUNT": "AJ",
    "SHAREDINDEXES_AWAY_RED_CARD_COUNT": "AK",
}
# Download the results page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() surfaces HTTP errors
# instead of letting an error page be parsed as if it were the results page.
r = requests.get(
    "https://www.scoreboard.com/en/soccer/england/premier-league/results/",
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

# "src" of the first <script> tag whose URL contains "core_"; it is
# downloaded later to extract the feed token.
script = next(
    (t["src"] for t in soup.find_all("script", src=True) if "core_" in t["src"]),
    None,
)
if script is None:
    raise RuntimeError("no <script> tag with 'core_' in its src was found")

# The match data is the text content of this div, in the custom
# "~" / "¬" / "÷" delimited format.
results_div = soup.find("div", {"id": "tournament-page-data-results"})
if results_div is None:
    raise RuntimeError("div#tournament-page-data-results was not found")
data = results_div.text
# Split the raw feed text into rows, and each row into its "key÷value" cells.
rows = [t.split(DELIMITER_CELL) for t in data.split(DELIMITER_ROW)]
data = []
for cells in rows:
    # Build {short_key: value}. partition() splits on the first delimiter
    # only, so a value that itself contains the delimiter is kept whole;
    # cells without any delimiter are ignored.
    rowData = {}
    for cell in cells:
        short_key, sep, value = cell.partition(DELIMITER_VALUE)
        if sep:
            rowData[short_key] = value
    # Rename the short feed keys to their descriptive labels; keys that are
    # not listed in `indexes` are left under their short name.
    for label, short_key in indexes.items():
        if short_key in rowData:
            rowData[label] = rowData.pop(short_key)
    # Keep only rows that carry a period start time; in the sample feed the
    # tournament header row has none and is therefore skipped.
    if "SHAREDINDEXES_PERIOD_START_UTIME" in rowData:
        data.append(rowData)
df = pd.DataFrame(data)
print(df)
# Download the core_*.js file and pull the hard-coded feed token out of it.
r = requests.get(f"https://www.scoreboard.com{script}", timeout=30)
r.raise_for_status()
# Raw string so that \s is a regex escape rather than an invalid string
# escape. [^']* stops at the closing quote, whereas a greedy .* would run to
# the last quote on the same line.
dataReg = re.search(r"feed_sign\s*=\s*'([^']*)'", r.text)
if dataReg is None:
    raise RuntimeError("feed_sign token was not found in the core script")
token = dataReg.group(1)
# Get the statistics for the first match, i.e. the first row of df.
if df.empty:
    raise RuntimeError("no matches were parsed from the results page")
eventId = df["SHAREDINDEXES_EVENT_ID"].values[0]
# The stats feed is requested with the token in the "x-fsign" header.
r = requests.get(
    f"https://d.scoreboard.com/en/x/feed/d_su_{eventId}_en_1",
    headers={"x-fsign": token},
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
# get the stats you want from soup
print(soup)
Try this on repl.it