【发布时间】:2022-11-18 07:27:05
【问题描述】:
我迷路了。由于目录中存在文件,您将无法运行代码。有谁知道为什么会这样?下面是代码和执行错误。它运行到 1900 才停止。为什么是1900?跑了5次,一直都是1900,如果马上就崩溃了,我会更理解这个问题,但是运行了,然后没有跑到一半?
import os
import pandas as pd
#this replace parse_data_live
SCORE_DIR = "data/scores"
box_scores = os.listdir(SCORE_DIR)
box_scores = [os.path.join(SCORE_DIR, f) for f in box_scores if f.endswith(".html")]
from bs4 import BeautifulSoup
def parse_html(box_score):
with open(box_score, encoding="utf-8") as f:
html = f.read()
#with open(box_score) as f:
#html = f.read()
soup = BeautifulSoup(html, 'lxml')
[s.decompose() for s in soup.select("tr.over_header")]
[s.decompose() for s in soup.select("tr.thead")]
return soup
def read_season_info(soup):
nav = soup.select("#bottom_nav_container")[0]
hrefs = [a["href"] for a in nav.find_all('a')]
season = os.path.basename(hrefs[1]).split("_")[0]
return season
def read_line_score(soup):
line_score = pd.read_html(str(soup), attrs={'id': 'line_score'})[0]
cols = list(line_score.columns)
cols[0] = "team"
cols[-1] = "total"
line_score.columns = cols
line_score = line_score[["team", "total"]]
return line_score
def read_stats(soup, team, stat):
df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
df = df.apply(pd.to_numeric, errors="coerce")
return df
games = []
base_cols = None
for box_score in box_scores:
soup = parse_html(box_score)
line_score = read_line_score(soup)
teams = list(line_score["team"])
summaries = []
for team in teams:
basic = read_stats(soup, team, "basic")
advanced = read_stats(soup, team, "advanced")
totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
totals.index = totals.index.str.lower()
maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])
maxes.index = maxes.index.str.lower() + "_max"
summary = pd.concat([totals, maxes])
if base_cols is None:
base_cols = list(summary.index.drop_duplicates(keep="first"))
base_cols = [b for b in base_cols if "bpm" not in b]
summary = summary[base_cols]
summaries.append(summary)
summary = pd.concat(summaries, axis=1).T
game = pd.concat([summary, line_score], axis=1)
game["home"] = [0, 1]
game_opp = game.iloc[::-1].reset_index()
game_opp.columns += "_opp"
full_game = pd.concat([game, game_opp], axis=1)
full_game["season"] = read_season_info(soup)
full_game["date"] = os.path.basename(box_score)[:8]
full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")
full_game["won"] = full_game["total"] > full_game["total_opp"]
games.append(full_game)
if len(games) % 100 == 0:
print(f"{len(games)} / {len(box_scores)}")
games_df = pd.concat(games, ignore_index=True)
print(games_df)
games_df.to_csv("nba_games.csv")
#outcome
100 / 8394
200 / 8394
300 / 8394
400 / 8394
500 / 8394
600 / 8394
700 / 8394
800 / 8394
900 / 8394
1000 / 8394
1100 / 8394
1200 / 8394
1300 / 8394
1400 / 8394
1500 / 8394
1600 / 8394
1700 / 8394
1800 / 8394
1900 / 8394
Traceback (most recent call last):
File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 730, in _build_doc
r = parse(self.io, parser=parser)
File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\lxml\html\__init__.py", line 937, in parse
return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
File "src\lxml\etree.pyx", line 3538, in lxml.etree.parse
File "src\lxml\parser.pxi", line 1876, in lxml.etree._parseDocument
File "src\lxml\parser.pxi", line 1902, in lxml.etree._parseDocumentFromURL
File "src\lxml\parser.pxi", line 1805, in lxml.etree._parseDocFromFile
File "src\lxml\parser.pxi", line 1177, in lxml.etree._BaseParser._parseDocFromFile
File "src\lxml\parser.pxi", line 615, in lxml.etree._ParserContext._handleParseResultDoc
File "src\lxml\parser.pxi", line 725, in lxml.etree._handleParseResult
File "src\lxml\parser.pxi", line 652, in lxml.etree._raiseParseError
OSError: Error reading file '': failed to load external entity ""
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Martin\PycharmProjects\Dog\nba game.py", line 52, in <module>
line_score = read_line_score(soup)
File "C:\Users\Martin\PycharmProjects\Dog\nba game.py", line 30, in read_line_score
line_score = pd.read_html(str(soup), attrs={'id': 'line_score'})[0]
File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 1098, in read_html
return _parse(
File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 906, in _parse
tables = p.parse_tables()
File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 222, in parse_tables
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 738, in _build_doc
r = fromstring(self.io, parser=parser)
File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\lxml\html\__init__.py", line 873, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\lxml\html\__init__.py", line 761, in document_fromstring
raise etree.ParserError(
lxml.etree.ParserError: Document is empty
Process finished with exit code 1
【问题讨论】:
-
OSError: Error reading file '': failed to load external entity ""你的列表里好像有一个空文件 -
夫妇的事情。 1) 它不会在 1900 时崩溃。它实际上会在 >= 1900 的地方崩溃。2) 在 html 中添加打印语句。这样你就可以看到它在哪个 url 崩溃了,然后你可以在浏览器中打开它并检查它,看看问题是什么。最后,如果您不提供重现此内容的所有内容,任何人都不可能帮助您。如果你需要帮助,你需要在
box_scores中包含 html 列表