爬虫（五）数据清洗-xpath、BeautifulSoup模块

xpath介绍和lxml安装

xpath表达式

如果正则表达式用的不好，处理html文档很累，有没有其他的方法？

有！就是用xpath，我们可以先将html文件转成xml文档，然后用xpath查找html节点或元素

我们需要安装lxml模块来支持xpath的操作

安装依赖

pip install lxml

解析字符串形式html

text = \'\'\'
<div>
    <ul>
        <li class="item-0"><a href="link1.html">张三</a></li>
        <li class="item-1"><a href="link2.html">李四</a></li>
        <li class="item-inactive"><a href="link3.html">王五</a></li>
        <li class="item-1"><a href="link4.html">赵六</a></li>
        <li class="item-0"><a href="link5.html">老七</a></li>
    </ul>
</div>
\'\'\'

from lxml import etree

# etree.HTML()将字符串解析成了特殊的html对象
html = etree.HTML(text)

print(type(html))

# 将html对象转成字符串
result = etree.tostring(html,encoding="utf-8").decode()

print(result)

解析本地html

解析本地html
爬虫中网页处理方式
在爬虫中，数据获取和数据清洗一体，HTML()
数据获取和数据清洗分开，parse()

from lxml import etree

#  获取本地html文档
html = etree.parse(r\'C:\Users\Administrator\PycharmProjects\Reptiles\a.html\')

result = etree.tostring(html,encoding="utf-8").decode()

print(result)

获取一类标签

from lxml import etree

html = etree.parse(r\'C:\Users\Administrator\PycharmProjects\Reptiles\a.html\')

result = html.xpath(\'//a\')  # 获取所有a标签的信息

print(result)

print(result[3].text)

获取指定属性的标签

from lxml import etree

html = etree.parse(r\'C:\Users\Administrator\PycharmProjects\Reptiles\a.html\')

result1 = html.xpath(\'//li[@class="item-1"]/span\')  # 获取所有span标签的信息
result2 = html.xpath(\'//li[@class="item-100"]/a\')  # 获取指定a标签的信息

print(result1[0].text)

print(result2[0].text)

获取标签的属性

from lxml import etree

html = etree.parse(r\'C:\Users\Administrator\PycharmProjects\Reptiles\a.html\')

result1 = html.xpath(\'//li/@class\')  # 获取所有li标签的属性信息
result2 = html.xpath(\'//li[@class="item-100"]/a/@href\')  # 获取指定a标签的属性信息

print(result1)

print(result2)

获取子标签

<div>
    <ul>
        <li class="item-0"><a href="link1.html">张三</a></li>
        <li class="item-11">
            <a href="link2.html">
                <span class="nnpp">李四</span>
            </a>
            <span>好人</span>
        </li>
        <li class="item-1"><span>小正正</span></li>
        <li class="item-inactive">
            <a href="link3.html">
                <span class="nppp">王五</span>
            </a>
        </li>
        <li class="item-100"><a href="link4.html">赵六</a></li>
        <li class="item-0"><a href="link5.html">老七</a></li>
    </ul>
</div>

from lxml import etree

html = etree.parse(r\'C:\Users\Administrator\PycharmProjects\Reptiles\a.html\')

result1 = html.xpath(\'//li/a\')  # 获取所有li标签下的a标签
result2 = html.xpath(\'//li//span\')  # 获取所有符合条件的子标签

print(result1)

print(result2[0].text)

# 获取li标签里所有的class
result3 = html.xpath("//li//a//@class")

print(result3)

获取标签内容和标签名

from lxml import etree

html = etree.parse(r\'C:\Users\Administrator\PycharmProjects\Reptiles\a.html\')

# 获取倒数第二个li元素下a标签的内容
result1 = html.xpath(\'//li[last()-1]/a\')

print(result1[0].text)

result2 = html.xpath(\'//li/a\')

print(result2[-2].text)

# 获取 class 值为bold的标签名
result3 = html.xpath("//*[@class=\'bold\']")

print(result3[1].tag)  # tag表示获取标签名

爬取网络段子

import requests
from lxml import etree

url = \'https://www.qiushibaike.com/\'

header = {
    \'User-Agent\':\'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58\',
    \'Accept-Language\':\'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6\'
}

response = requests.get(url,headers=header).text

html = etree.HTML(response)

result1 = html.xpath(\'//div//a[@class="recmd-content"]/@href\') # 获取div下的所有a标签href属性信息

for site in result1:
    xurl = "https://www.qiushibaike.com" + site
    response2 = requests.get(xurl,headers=header).text
    html2 = etree.HTML(response2)
    result2 = html2.xpath("//div[@class=\'content\']")
    print(result2[0].text)

爬取贴吧图片

# 图片爬虫
import urllib
import urllib.request
from lxml import etree


class Spider(object):
    def __init__(self):
        self.tiebaName = "车模"
        self.beginPage = 1
        self.endPage = 3
        self.url = "http://tieba.baidu.com/f?"
        self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}
        self.fileName = 1

    # 构造url
    def tiebaSpider(self):
        for page in range(self.beginPage, self.endPage + 1):
            pn = (page - 1) * 50
            wo = {\'pn\': pn, \'kw\': self.tiebaName}
            word = urllib.parse.urlencode(wo)
            myurl = self.url + word
            self.loadPage(myurl)

    # 爬取页面内容
    def loadPage(self, url):
        req = urllib.request.Request(url, headers=self.ua_header)
        data = urllib.request.urlopen(req).read()

        html = etree.HTML(data)
        links = html.xpath(\'//div[@class="threadlist_lz clearfix"]/div/a/@href\')

        for link in links:
            link = "http://tieba.baidu.com" + link
            self.loadImages(link)

    # 爬取帖子详情页，获得图片的链接
    def loadImages(self, link):
        req = urllib.request.Request(link, headers=self.ua_header)
        data = urllib.request.urlopen(req).read()
        html = etree.HTML(data)
        links = html.xpath(\'//img[@class="BDE_Image"]/@src\')
        for imageslink in links:
            self.writeImages(imageslink)

    # 通过图片所在链接，爬取图片并保存图片到本地：
    def writeImages(self, imagesLink):
        print("正在存储图片：", self.fileName, "....")

        image = urllib.request.urlopen(imagesLink).read()

        # 保存图片到本地
        file = open(r"C:\\Users\\Administrator\\Desktop\\贴吧图片\\" + str(self.fileName) + ".jpg", "wb")
        file.write(image)

        file.close()

        self.fileName += 1


if __name__ == \'__main__\':
    mySpider = Spider()

    mySpider.tiebaSpider()

BeautifulSoup简介和安装

安装BeautifulSoup

pip install beautifulsoup4

CSS 选择器：beautifulsoup4

和lxml 一样，beautifulsoup 也是一个HTML/XML的解析器
主要的功能也是如何解析和提取HTML/XML 数据

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse\'s story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse\'s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# 解析字符串形式的html
soup = BeautifulSoup(html,"lxml")

# 解析本地html文件
# soup2 = BeautifulSoup(open(\'index.html\'))

# print(soup)

# prettify 格式化输出soup对象（美化）
print(soup.prettify()) # prettify是用来美化代码

获取标签信息

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse\'s story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse\'s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# 解析字符串形式的html
soup = BeautifulSoup(html,"lxml")

# 根据标签名获取标签信息 soup、标签名
print(soup.title)
print(soup.title.string) # 标签里面的内容

# 获取标签名
print(soup.title.name)

# 获取标签内所有属性
print(soup.p.attrs[\'name\']) # 获取p标签内name属性值

# 获取直接子标签，结果是一个列表
print(soup.head.contents)

# 获取直接子标签，结果是一个生成器
print(soup.head.children)

# 获取所有的子标签
print(soup.descendants)

for i in soup.p.descendants: # 打印所有p标签信息，标签内的信息又会独立作为一个信息展示
    print(i)

搜索文档树

文档树，即：所有标签

# 搜索文档树 find_all()

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse\'s story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse\'s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# 解析字符串形式的html
soup = BeautifulSoup(html,"lxml")

# 根据标签名获取标签信息 soup、标签名
print(soup.title)
print(soup.title.string) # 标签里面的内容

# 根据字符串查找所有的a标签,返回一个结果集，里面装的是标签对象
data = soup.find_all("a")
print(type(data)) # <class \'bs4.element.ResultSet\'>集合
print(data[0].string)
for i in data:
    print(i.string)

# 搜索文档树 find_all()

from bs4 import BeautifulSoup
import re

html = """
<html><head><title>The Dormouse\'s story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse\'s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# 解析字符串形式的html
soup = BeautifulSoup(html,"lxml")

# 根据标签名获取标签信息 soup、标签名
# print(soup.title)
# print(soup.title.string) # 标签里面的内容

# 方式一
# 根据字符串查找所有的a标签,返回一个结果集，里面装的是标签对象
data = soup.find_all("a")
# print(type(data)) # <class \'bs4.element.ResultSet\'>集合
# print(data[0].string)
# for i in data:
#     print(i.string)

# 方式二
# 根据正则表达式查找标签
data2 = soup.find_all(re.compile("^b"))
for i in data2:
    print(i.string)

# 方式三
# 根据属性查找标签
data3 = soup.find_all(id="link2")
for i in data3:
    print(i)

# 方式四
# 根据标签内容获取标签内容
data4 = soup.find_all(text="Lacie")
data5 = soup.find_all(text=[\'Lacie\',\'Tillie\'])
data6 = soup.find_all(text=re.compile("Do")) # 查找包含Do文本的数据
print(data4)
print(data5)
print(data6)

CSS选择器

CSS选择器 - 通过select来查找
根据样式表来查找标签
CSS选择器类型：标签选择器、class选择器、id选择器

# 搜索文档树 find_all()

from bs4 import BeautifulSoup
import re

html = """
<html><head><title>The Dormouse\'s story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse\'s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# 解析字符串形式的html
soup = BeautifulSoup(html,"lxml")

# CSS选择器类型：标签选择器、class选择器、id选择器

# 通过标签名获取标签
data = soup.select(\'a\')
print(type(data)) # <class \'bs4.element.ResultSet\'>集合
print(data)

# 通过class名来查找
data2 = soup.select(".sister")
print(data2)

# 通过id来查找
data3 = soup.select("#link2")
print(data3)

# 通过组合查找
data4 = soup.select("p #link1") # p标签下面id为link1的标签
print(data4)

# 通过其他属性查找
data5 = soup.select(\'a[href="http://example.com/lacie"]\')
print(data5)

实战-爬取腾讯招聘信息

在爬取腾讯招聘信息前先分析url

import urllib.request
import time
import re

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84."
                         "0.4147.125 Safari/537.36 Edg/84.0.522.59"}

timestamp = int(time.time())
# print(timestamp)

for x in range(1, 3):
    page = x
    url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=" + str(timestamp) + "&pageIndex=" + str(page) + "&pageSize=10"
    # print(url)

    req = urllib.request.Request(url, headers=headers)
    data = urllib.request.urlopen(req).read().decode()
    # print(data)

    part = r\'PostId":"(.*?)",\'
    pattern = re.compile(part)

    data1 = pattern.findall(data)

    # print(data1)

    for x in data1:
        myurl = "https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=" + str(timestamp) + "&postId=" + str(x) + "&language=zh-cn"
        # print(myurl)
        req2 = urllib.request.Request(myurl,headers=headers)
        data3 = urllib.request.urlopen(req2).read().decode()

        # print(data3)

        part2 = r\'RecruitPostName":"(.*?)",\'
        pattern2 = re.compile(part2)
        name = pattern2.findall(data3)

        part3 = r\'Responsibility":"(.*?)",\'
        pattern3 = re.compile(part3)
        text = pattern3.findall(data3)
        
        print(name)
        print(text)


    print("-------------------------------------------------")