Python - beautifulsoup4

环境为：

Python3.6
windows
pycharm2017.2.4

# 安装beautifulsoup4
　　pip install beautifulsoup4

# 安装解析器
　　pip install lxml

# 另一个可供选择的解析器是纯Python实现的 html5lib，html5lib的解析方式与浏览器相同
　　pip install html5lib

基本使用

from bs4 import BeautifulSoup

# Sample markup used by this example. Note that it has no closing </body> or
# </html> tag, i.e. it is incomplete HTML.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" >Lacie</a> and
<a href="http://example.com/tillie" class="sister" >Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Basic usage: fault tolerance. Here that means the module can still cope with
# HTML that is incomplete, such as the markup above.
# Parsing yields a BeautifulSoup object, which can then be rendered with
# standard, structured indentation.
soup = BeautifulSoup(markup=html_doc, features='lxml')  # fault-tolerant parse

# prettify() takes care of the indentation and gives a structured view.
res = soup.prettify()
print(res)

标签选择器

　　即直接通过标签名字选择，选择速度快，如果存在多个相同的标签则只返回第一个

from bs4 import BeautifulSoup

# Sample markup for the tag-selector examples: the first <p> holds plain text
# only, and the first <a> holds both text and a nested <i> tag.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>

<p>first tag</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie<i>this i tag</i></a>,
<a href="http://example.com/lacie" class="sister" >Lacie</a> and
<a href="http://example.com/tillie" class="sister" >Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse the markup with the lxml parser; every example below reads from `soup`.
soup = BeautifulSoup(markup=html_doc, features='lxml')

# 获取标签
# print(soup.head)  # <head><title>The Dormouse's story</title></head>

# 获取标签的名称
# print(soup.p.name)  # p

# 直接获取标签，如果存在多个相同的标签则只返回第一个
# print(soup.p)  # <p>first tag</p>




# 获取标签的内容,

# print(soup.p.string)  # first tag
# print(soup.a.string)  # None
# print(soup.p.text)    # first tag
# print(soup.a.text)    # Elsiethis i tag
# print(soup.a.contents)  # ['Elsie', <i>this i tag</i>]
"""
注意
    contents获取选中标签内的所有的值，包括里面的标签
    string 只在标签内仅有一个子节点时才有值（唯一子节点是文本则返回该文本，是标签则返回该子标签的string）；如果标签内有多个子节点，则返回None
    text则获取包括子标签在内的所有值
"""

# 嵌套选择
# print(soup.head.title.string)  # The Dormouse's story
# print(soup.body.a.contents)  # ['Elsie', <i>this i tag</i>]
# print(soup.body.a.text)    # Elsiethis i tag
# print(soup.body.a.string)  # None
# print(soup.body.p.string)  # first tag

# 获取子节点，子孙节点
# print(soup.contents)  # 返回文档顶层的子节点列表（这里只有 html 节点，整个页面都包含在其中）
# print(soup.p.contents)  # ['first tag']
# print(soup.p.children)  # 得到一个迭代器，包含此标签内所有的子节点
# print(list(soup.a.children))  # ['Elsie', <i>this i tag</i>]
# print(soup.p.descendants)  # <generator object descendants at 0x00000162FFB9D570>
# print(list(soup.a.descendants))  # 获取子孙节点,a下所有的节点都会选择出来 ['Elsie', <i>this i tag</i>, 'this i tag']
# for i, child in enumerate(soup.p.descendants):
#     print(i, child)   # 0 first tag

# 获取父节点，祖先节点
# print(soup.a.parent)  # 获取 a 标签的父节点（即包含它的 p 标签）
# print(soup.a.parents)  # <generator object parents at 0x0000022F8747D570>
# print(list(soup.a.parents))   # a 标签的父，父，父节点都会找出来，到html节点

# 获取兄弟节点
# print(soup.a.next_siblings)  # 生成器对象 <generator object next_siblings at 0x000002418B9BD570>
# print(list(soup.a.next_siblings))

beautifulsoup4标签选择器