普通函数
下面这个实现可以处理 <br> 标签的任意兄弟节点(不仅仅是字符串):
from bs4 import BeautifulSoup, Tag
def breaks_to_paragraphs(
    tag: Tag,
    soup: BeautifulSoup,
    recursive: bool = False,
) -> None:
    """
    Split the children of `tag` into `<p>` tags at every `<br>` child.

    Each `<br>` child is replaced by a new `<p>` that receives the siblings
    collected since the previous `<br>` (or since the start of `tag`); a
    `<br>` with nothing before it therefore yields an empty `<p>`.
    Siblings following the last `<br>` are wrapped in one final `<p>`.
    If `tag` has no `<br>` children, its child list is left as it is.

    After the restructuring, `soup.smooth()` is called exactly once.

    Args:
        tag:
            The `Tag` instance to mutate
        soup:
            The `BeautifulSoup` instance the tag belongs to (for `new_tag`)
        recursive (optional):
            If `True`, the function is applied to all nested tags recursively;
            otherwise (default) only the children are affected.
    """
    _split_on_breaks(tag, soup, recursive)
    # Smooth only once, after the whole subtree has been restructured.
    # Doing it inside the recursion would re-traverse the entire document
    # for every nested tag and may consolidate strings that an enclosing
    # call has already collected from its snapshot of children.
    soup.smooth()


def _split_on_breaks(tag: Tag, soup: BeautifulSoup, recursive: bool) -> None:
    """Perform the `<br>` -> `<p>` restructuring of `tag` without smoothing."""
    pending = []  # siblings seen since the previous `<br>`
    contains_br = False
    # Iterate over a snapshot because the loop replaces and moves children.
    for child in list(tag.children):
        if isinstance(child, Tag) and child.name == "br":
            contains_br = True
            p = soup.new_tag("p")
            child.replace_with(p)
            # Moves the collected siblings out of `tag` and into the new `<p>`.
            p.extend(pending)
            pending.clear()
            continue
        if recursive and isinstance(child, Tag):
            _split_on_breaks(child, soup, recursive)
        # Both non-`<br>` tags and plain strings are collected.
        pending.append(child)
    # Wrap trailing siblings only if a `<br>` was found at this level.
    if pending and contains_br:
        p = soup.new_tag("p")
        tag.append(p)
        p.extend(pending)
子类方法
或者,由于调用 new_tag 方法需要原始的 BeautifulSoup 实例,您也可以继承 BeautifulSoup,并将该函数实现为子类的方法:
from bs4 import BeautifulSoup, Tag
class CustomSoup(BeautifulSoup):
    """`BeautifulSoup` subclass that can split tags into `<p>` tags at `<br>`."""

    def breaks_to_paragraphs(self, tag: Tag, recursive: bool = False) -> None:
        """
        Split the children of `tag` into `<p>` tags at every `<br>` child.

        Each `<br>` child is replaced by a new `<p>` that receives the
        siblings collected since the previous `<br>` (or since the start of
        `tag`); a `<br>` with nothing before it therefore yields an empty
        `<p>`. Siblings following the last `<br>` are wrapped in one final
        `<p>`. If `tag` has no `<br>` children, its child list is left as
        it is.

        After the restructuring, `self.smooth()` is called exactly once.

        Args:
            tag:
                The `Tag` instance to mutate
            recursive (optional):
                If `True`, the function is applied to all nested tags
                recursively; otherwise (default) only the children are
                affected.
        """
        self._split_on_breaks(tag, recursive)
        # Smooth only once, after the whole subtree has been restructured.
        # Doing it inside the recursion would re-traverse the entire document
        # for every nested tag and may consolidate strings that an enclosing
        # call has already collected from its snapshot of children.
        self.smooth()

    def _split_on_breaks(self, tag: Tag, recursive: bool) -> None:
        """Perform the `<br>` -> `<p>` restructuring of `tag` without smoothing."""
        pending = []  # siblings seen since the previous `<br>`
        contains_br = False
        # Iterate over a snapshot because the loop replaces and moves children.
        for child in list(tag.children):
            if isinstance(child, Tag) and child.name == "br":
                contains_br = True
                p = self.new_tag("p")
                child.replace_with(p)
                # Moves the collected siblings out of `tag` into the new `<p>`.
                p.extend(pending)
                pending.clear()
                continue
            if recursive and isinstance(child, Tag):
                self._split_on_breaks(child, recursive)
            # Both non-`<br>` tags and plain strings are collected.
            pending.append(child)
        # Wrap trailing siblings only if a `<br>` was found at this level.
        if pending and contains_br:
            p = self.new_tag("p")
            tag.append(p)
            p.extend(pending)
演示
这是一个快速测试:
...
def main() -> None:
    """Parse a small sample document, split its first `<p>` at `<br>`, print it."""
    markup = """
<p>
First paragraph. <br/>
Second paragraph.<br/>
<span>foo</span>
<span>bar<br>baz</span>
</p>
"""
    document = CustomSoup(markup, "html.parser")
    # The outer `<p>` is mutated in place, so it is still the first `<p>`
    # of the document afterwards and can be printed directly.
    paragraph = document.p
    document.breaks_to_paragraphs(paragraph)
    print(paragraph.prettify())


# Run the demo only when executed as a script.
if __name__ == "__main__":
    main()
输出:
<p>
<p>
First paragraph.
</p>
<p>
Second paragraph.
</p>
<p>
<span>
foo
</span>
<span>
bar
<br/>
baz
</span>
</p>
</p>
如果改为调用 soup.breaks_to_paragraphs(soup.p, recursive=True),则输出为:
<p>
<p>
First paragraph.
</p>
<p>
Second paragraph.
</p>
<p>
<span>
foo
</span>
<span>
<p>
bar
</p>
<p>
baz
</p>
</span>
</p>
</p>
注意嵌套的 <span> 是如何在其内部的 <br> 处被拆分成多个 <p> 标签的。