这是我想出的解决方案：
import re

import lxml
import lxml.etree
from lxml.html.clean import Cleaner
def clean_html(html):
    """Sanitize ``html`` with lxml's ``Cleaner`` and return the cleaned markup.

    Whether the input already had a ``<div>`` root is recorded *before*
    cleaning. If it did not, a root ``<div>`` found in the cleaned output is
    stripped again (presumably lxml added it while parsing a multi-node
    fragment -- confirm against the lxml version in use).

    Args:
        html: Markup to clean. Falsy values (``None``, ``''``) are returned
            unchanged without touching lxml.

    Returns:
        The cleaned markup, or ``html`` itself when it is falsy.
    """
    if not html:
        return html

    # Must be evaluated on the raw input: after cleaning, an original root
    # <div> and a wrapper introduced by the cleaner look the same.
    had_root_div = check_is_wrap_in_div(html)
    cleaned = Cleaner().clean_html(html)
    if had_root_div:
        return cleaned
    return remove_root_div(cleaned)
def check_is_wrap_in_div(html):
    """Tell whether ``html`` parses as XML with a ``<div>`` root element.

    The markup is parsed with ``lxml.etree.fromstring``. Input the XML parser
    rejects with ``XMLSyntaxError`` is reported as not wrapped.

    Args:
        html: Markup to inspect.

    Returns:
        ``True`` if parsing succeeds and the root tag is ``div``, else
        ``False``.
    """
    try:
        root = lxml.etree.fromstring(html)
    except lxml.etree.XMLSyntaxError:
        return False
    return root.tag == 'div'
def remove_root_div(html):
    """Strip a ``<div>`` that wraps the whole of ``html`` and return its content.

    The wrapper is recognised textually: optional leading whitespace, an
    opening ``<div ...>`` tag, and a final ``</div>`` followed only by
    whitespace. Anything else is returned unchanged, so markup with content
    after the last ``</div>`` or starting with a tag that merely begins with
    ``div`` (e.g. ``<divider>``) is never truncated.

    Limitation: tags are not balanced, so sibling divs such as
    ``<div>a</div><div>b</div>`` are still treated as one wrapper.

    Args:
        html: Markup string to unwrap.

    Returns:
        The markup between the root ``<div ...>`` and its closing ``</div>``,
        or ``html`` unchanged when no such wrapper is found. If ``html`` ends
        with a newline, that final newline is kept.
    """
    # group 1: opening tag, group 2: wrapped content, group 3: closing tag.
    # `\s*?$` lets `$` stop before a final newline, so that newline survives.
    root_div_regex = r'^(\s*<div(?:\s[^>]*)?>)([\s\S]*)(<\/div>\s*?)$'
    return re.sub(root_div_regex, r'\2', html, count=1)
# Example usage: pass the HTML string to sanitize (here `evil_html`,
# presumably untrusted input) and keep the cleaned result.
cleaned_html = clean_html(evil_html)
以下是相应的单元测试：
class TestBase(unittest.TestCase):
    """Tests for ``utils.check_is_wrap_in_div`` and ``utils.remove_root_div``.

    Fixtures are class-level constants so both tests share them. Every case
    runs in its own ``subTest`` so one failure neither hides the remaining
    cases nor leaves the offending input unidentified.
    """

    # --- fragments whose only top-level element is a <div> ---------------
    DIV_EMPTY = '<div></div>'
    DIV_PLAIN = (
        '\n'
        '<div>\n'
        '<p>Hello</p>\n'
        '<p>Test</p>\n'
        '</div>\n'
    )
    DIV_INLINE_ATTRS = (
        '\n'
        '<div class="test" style="color: blue;">\n'
        '<p>Hello</p>\n'
        '<p>Test</p>\n'
        '</div>\n'
    )
    DIV_ATTRS_BRACKET_ON_OWN_LINE = (
        '\n'
        '<div\n'
        'class="test"\n'
        'style="color: blue;"\n'
        '>\n'
        '<p>Hello</p>\n'
        '<p>Test</p>\n'
        '</div>\n'
    )
    DIV_ATTRS_MULTILINE = (
        '\n'
        '<div\n'
        'class="test"\n'
        'style="color: blue;">\n'
        '<p>Hello</p>\n'
        '<p>Test</p>\n'
        '</div>\n'
    )
    DIV_NESTED_WITH_SIBLINGS = (
        '\n'
        '<div\n'
        'class="test"\n'
        'style="color: blue;">\n'
        '<div>\n'
        '<div>\n'
        '<p>Hello</p>\n'
        '<p>Test</p>\n'
        '</div>\n'
        '</div>\n'
        '<div>\n'
        '<p>Hi</p>\n'
        '</div>\n'
        '</div>\n'
    )
    DIV_NESTED = (
        '\n'
        '<div\n'
        'class="test"\n'
        'style="color: blue;">\n'
        '<div>\n'
        '<p>Hello</p>\n'
        '<p>Test</p>\n'
        '</div>\n'
        '</div>\n'
    )
    # Same as DIV_NESTED but the markup starts directly with the tag.
    DIV_NESTED_NO_LEADING_NEWLINE = (
        '<div\n'
        'class="test"\n'
        'style="color: blue;">\n'
        '<div>\n'
        '<p>Hello</p>\n'
        '<p>Test</p>\n'
        '</div>\n'
        '</div>\n'
    )

    WRAPPED_IN_DIV = (
        DIV_EMPTY,
        DIV_PLAIN,
        DIV_INLINE_ATTRS,
        DIV_ATTRS_BRACKET_ON_OWN_LINE,
        DIV_ATTRS_MULTILINE,
        DIV_NESTED_WITH_SIBLINGS,
        DIV_NESTED,
    )

    # --- fragments that do not start with a <div> ------------------------
    NOT_DIV_ROOTED = (
        (
            '\n'
            '<body>\n'
            '<div\n'
            'class="test"\n'
            'style="color: blue;">\n'
            '<p>hello</p>\n'
            '</div>\n'
            '</body>\n'
        ),
        (
            '\n'
            '<p>HELLO</p>\n'
            '<p>TEST</p>\n'
        ),
        (
            '\n'
            '<section>\n'
            '<div>\n'
            '<p>hello</p>\n'
            '</div>\n'
            '</section>\n'
        ),
        '<p>HELLO</p><p>TEST</p>',
        (
            '\n'
            '<body>\n'
            '<div class="HELO">\n'
            '<p>hello</p>\n'
            '</div>\n'
            '</body>\n'
        ),
        (
            '\n'
            '<body>\n'
            '<div\n'
            'class="test"\n'
            'style="color: blue;"\n'
            '>\n'
            '<p>hello</p>\n'
            '</div>\n'
            '</body>\n'
        ),
    )

    # --- fragments with several top-level elements (some of them <div>) --
    MULTI_ROOT = (
        (
            '\n'
            '<p>Hello</p>\n'
            '<p>World</p>\n'
            '<div class="testing">\n'
            'Hello\n'
            '</div>\n'
        ),
        (
            '\n'
            '<div>\n'
            '<p>Hello</p>\n'
            '<p>World</p>\n'
            '<p>Hello</p>\n'
            '<p>World</p>\n'
            '</div>\n'
            '<div> </div>\n'
        ),
        (
            '\n'
            '<div>\n'
            '<div>\n'
            '<p>Hello</p>\n'
            '<p>World</p>\n'
            '<p>Hello</p>\n'
            '<p>World</p>\n'
            '</div>\n'
            '</div>\n'
            '<span>\n'
            '<div> </div>\n'
            '</span>\n'
        ),
    )

    # --- expected remove_root_div() results, compared after .strip() -----
    INNER_PARAGRAPHS = (
        '<p>Hello</p>\n'
        '<p>Test</p>'
    )
    INNER_DIV = (
        '<div>\n'
        '<p>Hello</p>\n'
        '<p>Test</p>\n'
        '</div>'
    )
    ROOT_DIV_REMOVALS = (
        (DIV_PLAIN, INNER_PARAGRAPHS),
        (DIV_INLINE_ATTRS, INNER_PARAGRAPHS),
        (DIV_ATTRS_BRACKET_ON_OWN_LINE, INNER_PARAGRAPHS),
        (DIV_ATTRS_MULTILINE, INNER_PARAGRAPHS),
        (DIV_NESTED, INNER_DIV),
        (DIV_NESTED_NO_LEADING_NEWLINE, INNER_DIV),
    )

    def test_check_is_wrap_in_div(self):
        """Only fragments whose single root element is a <div> are wrapped."""
        for html in self.WRAPPED_IN_DIV:
            with self.subTest('test html wrap in div', html=html):
                self.assertTrue(utils.check_is_wrap_in_div(html))

        for html in self.NOT_DIV_ROOTED + self.MULTI_ROOT:
            with self.subTest('test html not wrap in div', html=html):
                self.assertFalse(utils.check_is_wrap_in_div(html))

    def test_remove_root_div(self):
        """A root <div> is stripped; markup not starting with one is kept."""
        # The empty wrapper is compared exactly, without stripping.
        with self.subTest('test remove root html', html=self.DIV_EMPTY):
            self.assertEqual(utils.remove_root_div(self.DIV_EMPTY), '')

        for html, expected in self.ROOT_DIV_REMOVALS:
            with self.subTest('test remove root html', html=html):
                self.assertEqual(
                    utils.remove_root_div(html).strip(),
                    expected,
                )

        for html in self.NOT_DIV_ROOTED:
            with self.subTest('test not root html', html=html):
                self.assertEqual(utils.remove_root_div(html), html)