Python ParseError 文档为空答案

【问题标题】：Python ParseError Document is emptyPython ParseError 文档为空
【发布时间】：2022-11-18 07:27:05
【问题描述】：

我迷路了。由于目录中存在文件，您将无法运行代码。有谁知道为什么会这样？下面是代码和执行错误。它运行到 1900 才停止。为什么是1900？跑了5次，一直都是1900，如果马上就崩溃了，我会更理解这个问题，但是运行了，然后没有跑到一半？

import os
import pandas as pd
#this replace parse_data_live
SCORE_DIR = "data/scores"
box_scores = os.listdir(SCORE_DIR)
box_scores = [os.path.join(SCORE_DIR, f) for f in box_scores if f.endswith(".html")]
from bs4 import BeautifulSoup


def parse_html(box_score):
    with open(box_score, encoding="utf-8") as f:
        html = f.read()
    #with open(box_score) as f:
        #html = f.read()

    soup = BeautifulSoup(html, 'lxml')
    [s.decompose() for s in soup.select("tr.over_header")]
    [s.decompose() for s in soup.select("tr.thead")]
    return soup


def read_season_info(soup):
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [a["href"] for a in nav.find_all('a')]
    season = os.path.basename(hrefs[1]).split("_")[0]
    return season


def read_line_score(soup):
    line_score = pd.read_html(str(soup), attrs={'id': 'line_score'})[0]
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    line_score = line_score[["team", "total"]]

    return line_score


def read_stats(soup, team, stat):
    df = pd.read_html(str(soup), attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
    df = df.apply(pd.to_numeric, errors="coerce")
    return df


games = []
base_cols = None
for box_score in box_scores:
    soup = parse_html(box_score)

    line_score = read_line_score(soup)
    teams = list(line_score["team"])

    summaries = []
    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")

        totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
        totals.index = totals.index.str.lower()

        maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])
        maxes.index = maxes.index.str.lower() + "_max"

        summary = pd.concat([totals, maxes])

        if base_cols is None:
            base_cols = list(summary.index.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]

        summary = summary[base_cols]

        summaries.append(summary)
    summary = pd.concat(summaries, axis=1).T

    game = pd.concat([summary, line_score], axis=1)

    game["home"] = [0, 1]

    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"

    full_game = pd.concat([game, game_opp], axis=1)
    full_game["season"] = read_season_info(soup)

    full_game["date"] = os.path.basename(box_score)[:8]
    full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")

    full_game["won"] = full_game["total"] > full_game["total_opp"]
    games.append(full_game)

    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")
games_df = pd.concat(games, ignore_index=True)
print(games_df)
games_df.to_csv("nba_games.csv")




#outcome

100 / 8394
200 / 8394
300 / 8394
400 / 8394
500 / 8394
600 / 8394
700 / 8394
800 / 8394
900 / 8394
1000 / 8394
1100 / 8394
1200 / 8394
1300 / 8394
1400 / 8394
1500 / 8394
1600 / 8394
1700 / 8394
1800 / 8394
1900 / 8394
Traceback (most recent call last):
  File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 730, in _build_doc
    r = parse(self.io, parser=parser)
  File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\lxml\html\__init__.py", line 937, in parse
    return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
  File "src\lxml\etree.pyx", line 3538, in lxml.etree.parse
  File "src\lxml\parser.pxi", line 1876, in lxml.etree._parseDocument
  File "src\lxml\parser.pxi", line 1902, in lxml.etree._parseDocumentFromURL
  File "src\lxml\parser.pxi", line 1805, in lxml.etree._parseDocFromFile
  File "src\lxml\parser.pxi", line 1177, in lxml.etree._BaseParser._parseDocFromFile
  File "src\lxml\parser.pxi", line 615, in lxml.etree._ParserContext._handleParseResultDoc
  File "src\lxml\parser.pxi", line 725, in lxml.etree._handleParseResult
  File "src\lxml\parser.pxi", line 652, in lxml.etree._raiseParseError
OSError: Error reading file '': failed to load external entity ""

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Martin\PycharmProjects\Dog\nba game.py", line 52, in <module>
    line_score = read_line_score(soup)
  File "C:\Users\Martin\PycharmProjects\Dog\nba game.py", line 30, in read_line_score
    line_score = pd.read_html(str(soup), attrs={'id': 'line_score'})[0]
  File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 1098, in read_html
    return _parse(
  File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 906, in _parse
    tables = p.parse_tables()
  File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 222, in parse_tables
    tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
  File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\pandas\io\html.py", line 738, in _build_doc
    r = fromstring(self.io, parser=parser)
  File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\lxml\html\__init__.py", line 873, in fromstring
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  File "C:\Users\Martin\PycharmProjects\Dog\venv\lib\site-packages\lxml\html\__init__.py", line 761, in document_fromstring
    raise etree.ParserError(
lxml.etree.ParserError: Document is empty

Process finished with exit code 1

【问题讨论】：

OSError: Error reading file '': failed to load external entity "" 你的列表里好像有一个空文件
夫妇的事情。 1) 它不会在 1900 时崩溃。它实际上会在 >= 1900 的地方崩溃。2) 在 html 中添加打印语句。这样你就可以看到它在哪个 url 崩溃了，然后你可以在浏览器中打开它并检查它，看看问题是什么。最后，如果您不提供重现此内容的所有内容，任何人都不可能帮助您。如果你需要帮助，你需要在 box_scores 中包含 html 列表

标签： python pandas parsing

【解决方案1】：

我在同一个项目上工作，遇到了同样的问题。

运行这段代码后，我发现了 3 个空框分数：

for b in box_scores:
    if os.path.getsize(b) == 0: # check for empty files
        print(b)

我使用以下列表理解删除了 3 个空文件：

box_scores = [b for b in box_scores if os.path.getsize(b) != 0]

目前正在重新运行代码，会让您知道它是否有效。

【讨论】：