xpath介绍和lxml安装
xpath表达式
如果正则表达式用得不好,处理html文档会很累,有没有其他的方法?
有!就是用xpath,我们可以先将html文件转成xml文档,然后用xpath查找html节点或元素
我们需要安装lxml模块来支持xpath的操作
安装依赖
pip install lxml
解析字符串形式html
from lxml import etree

# Sample HTML fragment used by this example.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">张三</a></li>
<li class="item-1"><a href="link2.html">李四</a></li>
<li class="item-inactive"><a href="link3.html">王五</a></li>
<li class="item-1"><a href="link4.html">赵六</a></li>
<li class="item-0"><a href="link5.html">老七</a></li>
</ul>
</div>
'''

# etree.HTML() parses a string with lxml's HTML parser and returns an
# element object that supports .xpath() queries.
html = etree.HTML(text)
print(type(html))

# Serialize the parsed tree back to bytes, then decode to str for printing.
result = etree.tostring(html, encoding="utf-8").decode()
print(result)
解析本地html
解析本地html
爬虫中网页处理方式
在爬虫中,如果数据获取和数据清洗是一体的(直接解析请求返回的HTML字符串),使用 etree.HTML()
如果数据获取和数据清洗是分开的(先把页面保存为本地文件,再读取解析),使用 etree.parse()
from lxml import etree

# Load a local HTML document from disk.
# NOTE(review): etree.parse() uses the XML parser unless another parser is
# passed, so the file has to be well-formed; for arbitrary HTML consider
# etree.parse(path, etree.HTMLParser()).
html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')

# Serialize the tree to bytes and decode it so it prints as readable text.
result = etree.tostring(html, encoding="utf-8").decode()
print(result)
获取一类标签
from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')

# "//a" selects every <a> element anywhere in the document.
result = html.xpath('//a')
print(result)

# Text of the fourth <a> element (list index 3).
print(result[3].text)
获取指定属性的标签
from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')

# <span> elements that are direct children of <li class="item-1">.
result1 = html.xpath('//li[@class="item-1"]/span')
# <a> elements that are direct children of <li class="item-100">.
result2 = html.xpath('//li[@class="item-100"]/a')

print(result1[0].text)
print(result2[0].text)
获取标签的属性
from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')

# "@class" returns attribute values (strings), not elements:
# the class value of every <li> that has one.
result1 = html.xpath('//li/@class')
# The href value of the <a> directly inside <li class="item-100">.
result2 = html.xpath('//li[@class="item-100"]/a/@href')

print(result1)
print(result2)
获取子标签
<div>
<ul>
<li class="item-0"><a href="link1.html">张三</a></li>
<li class="item-11">
<a href="link2.html">
<span class="nnpp">李四</span>
</a>
<span>好人</span>
</li>
<li class="item-1"><span>小正正</span></li>
<li class="item-inactive">
<a href="link3.html">
<span class="nppp">王五</span>
</a>
</li>
<li class="item-100"><a href="link4.html">赵六</a></li>
<li class="item-0"><a href="link5.html">老七</a></li>
</ul>
</div>
from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')

# "/" selects direct children: <a> elements immediately inside an <li>.
result1 = html.xpath('//li/a')
# "//" selects descendants at any depth: every <span> nested inside an <li>.
result2 = html.xpath('//li//span')
print(result1)
print(result2[0].text)

# class attribute values found on <a> elements inside an <li>, or on any
# element nested within those <a> elements. This does NOT return the class
# attributes of the <li> elements themselves.
result3 = html.xpath("//li//a//@class")
print(result3)
获取标签内容和标签名
from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')

# Text of the <a> inside the second-to-last <li>, selected inside XPath ...
result1 = html.xpath('//li[last()-1]/a')
print(result1[0].text)

# ... and the second-to-last <a> that is a direct child of an <li>, selected
# with a negative Python list index instead.
result2 = html.xpath('//li/a')
print(result2[-2].text)

# Tag names of every element whose class attribute equals "bold".
# Iterating (rather than indexing a fixed position) avoids an IndexError
# when fewer elements match than expected; the sample document shown in
# these notes has no class="bold" element at all.
result3 = html.xpath("//*[@class='bold']")
for element in result3:
    print(element.tag)  # .tag holds the element's tag name
爬取网络段子
import requests
from lxml import etree

url = 'https://www.qiushibaike.com/'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
}

# Fetch the landing page; the timeout keeps a stalled server from hanging
# the script forever.
response = requests.get(url, headers=header, timeout=10).text
html = etree.HTML(response)

# href values of every <a class="recmd-content"> located under a <div>.
result1 = html.xpath('//div//a[@class="recmd-content"]/@href')

for site in result1:
    # The hrefs are site-relative, so prefix the host to build the detail URL.
    xurl = "https://www.qiushibaike.com" + site
    response2 = requests.get(xurl, headers=header, timeout=10).text
    html2 = etree.HTML(response2)
    result2 = html2.xpath("//div[@class='content']")
    # Skip detail pages that have no content block instead of crashing.
    if result2:
        print(result2[0].text)
爬取贴吧图片
# 图片爬虫
import urllib
import urllib.request
from lxml import etree
class Spider(object):
    """Crawl the thread list of a Baidu Tieba forum and save the images found in each thread."""

    def __init__(self):
        # Forum name, sent as the "kw" query parameter.
        self.tiebaName = "车模"
        # First and last list page to crawl (both inclusive).
        self.beginPage = 1
        self.endPage = 3
        self.url = "http://tieba.baidu.com/f?"
        # NOTE(review): the User-Agent value has an unbalanced parenthesis;
        # it is left unchanged here.
        self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}
        # Running counter used as the file name of the next saved image.
        self.fileName = 1

    def tiebaSpider(self):
        """Build the URL of every list page in the configured range and crawl it."""
        # Imported explicitly: "import urllib.request" only makes
        # urllib.parse available as a side effect.
        import urllib.parse

        for page in range(self.beginPage, self.endPage + 1):
            # The "pn" parameter advances by 50 for each list page.
            pn = (page - 1) * 50
            wo = {'pn': pn, 'kw': self.tiebaName}
            word = urllib.parse.urlencode(wo)
            myurl = self.url + word
            self.loadPage(myurl)

    def loadPage(self, url):
        """Download one list page and visit every thread linked from it."""
        req = urllib.request.Request(url, headers=self.ua_header)
        with urllib.request.urlopen(req) as resp:
            data = resp.read()
        html = etree.HTML(data)
        links = html.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        for link in links:
            # Thread hrefs are site-relative; prefix the host.
            link = "http://tieba.baidu.com" + link
            self.loadImages(link)

    def loadImages(self, link):
        """Download a thread page and save every image whose class is BDE_Image."""
        req = urllib.request.Request(link, headers=self.ua_header)
        with urllib.request.urlopen(req) as resp:
            data = resp.read()
        html = etree.HTML(data)
        links = html.xpath('//img[@class="BDE_Image"]/@src')
        for imageslink in links:
            self.writeImages(imageslink)

    def writeImages(self, imagesLink):
        """Fetch the image at ``imagesLink`` and write it to disk under the next file number."""
        print("正在存储图片:", self.fileName, "....")
        with urllib.request.urlopen(imagesLink) as resp:
            image = resp.read()
        # The target directory must already exist; open() does not create it.
        # The context manager closes the file even if the write fails.
        with open(r"C:\\Users\\Administrator\\Desktop\\贴吧图片\\" + str(self.fileName) + ".jpg", "wb") as file:
            file.write(image)
        self.fileName += 1
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    mySpider = Spider()
    mySpider.tiebaSpider()
BeautifulSoup简介和安装
安装BeautifulSoup
pip install beautifulsoup4
CSS 选择器:beautifulsoup4
- 和lxml 一样,beautifulsoup 也是一个HTML/XML的解析器
- 主要功能同样是解析和提取 HTML/XML 数据
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Parse HTML supplied as a string, using lxml as the underlying parser.
# A local file can be parsed the same way by passing an open file object,
# e.g. BeautifulSoup(open('index.html'), "lxml").
soup = BeautifulSoup(html, "lxml")

# prettify() returns the parsed tree as an indented, human-readable string.
print(soup.prettify())
获取标签信息
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Parse HTML supplied as a string.
soup = BeautifulSoup(html, "lxml")

# Attribute-style access (soup.<tag name>) returns the first tag with that name.
print(soup.title)
print(soup.title.string)  # the text inside the tag

# The tag's own name.
print(soup.title.name)

# .attrs holds the tag's attributes; here only the value of the first
# <p>'s "name" attribute is printed.
print(soup.p.attrs['name'])

# Direct children of <head>, as a list.
print(soup.head.contents)

# Direct children of <head>, as an iterator (this prints the iterator
# object itself, not its items).
print(soup.head.children)

# All descendants of the document, as a generator (again, this prints the
# generator object itself).
print(soup.descendants)

# Walk everything nested under the first <p>: each child tag is yielded,
# and the text inside it is then yielded as a separate item.
for i in soup.p.descendants:
    print(i)
搜索文档树
文档树,即:所有标签
# Searching the document tree with find_all()
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Parse HTML supplied as a string.
soup = BeautifulSoup(html, "lxml")

# Attribute-style access returns the first tag with that name.
print(soup.title)
print(soup.title.string)  # the text inside the tag

# find_all("a") returns every <a> tag as a result set of tag objects.
data = soup.find_all("a")
print(type(data))  # <class 'bs4.element.ResultSet'>
print(data[0].string)
for i in data:
    print(i.string)
# Searching the document tree with find_all(): four ways to describe a match
from bs4 import BeautifulSoup
import re

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Parse HTML supplied as a string.
soup = BeautifulSoup(html, "lxml")

# Way 1: by tag name. Returns a result set of tag objects.
data = soup.find_all("a")

# Way 2: by regular expression applied to the tag name
# (here: every tag whose name starts with "b").
data2 = soup.find_all(re.compile("^b"))
for i in data2:
    print(i.string)

# Way 3: by attribute value.
data3 = soup.find_all(id="link2")
for i in data3:
    print(i)

# Way 4: by text content. This returns the matching strings, not tags.
# NOTE(review): newer Beautiful Soup releases name this argument "string";
# "text" is the older spelling - confirm against the installed version.
data4 = soup.find_all(text="Lacie")
data5 = soup.find_all(text=['Lacie', 'Tillie'])
data6 = soup.find_all(text=re.compile("Do"))  # strings containing "Do"
print(data4)
print(data5)
print(data6)
CSS选择器
CSS选择器 - 通过select来查找
根据样式表来查找标签
CSS选择器类型:标签选择器、class选择器、id选择器
# Searching with CSS selectors via select()
from bs4 import BeautifulSoup
import re

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Parse HTML supplied as a string.
soup = BeautifulSoup(html, "lxml")

# Selector kinds shown below: tag, class, id, combined, and attribute.

# Tag selector: every <a> tag.
data = soup.select('a')
print(type(data))  # a list-like collection of tag objects
print(data)

# Class selector: every tag with class "sister".
data2 = soup.select(".sister")
print(data2)

# Id selector: the tag with id "link2".
data3 = soup.select("#link2")
print(data3)

# Combined (descendant) selector: the tag with id "link1" nested inside a <p>.
data4 = soup.select("p #link1")
print(data4)

# Attribute selector: <a> tags whose href equals the given URL.
data5 = soup.select('a[href="http://example.com/lacie"]')
print(data5)
实战-爬取腾讯招聘信息
在爬取腾讯招聘信息前先分析url
import urllib.request
import time
import re

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84."
                         "0.4147.125 Safari/537.36 Edg/84.0.522.59"}

# Current Unix time in whole seconds, sent as the "timestamp" query parameter.
timestamp = int(time.time())

# The patterns never change, so compile them once instead of on every iteration.
pattern = re.compile(r'PostId":"(.*?)",')
pattern2 = re.compile(r'RecruitPostName":"(.*?)",')
pattern3 = re.compile(r'Responsibility":"(.*?)",')

# Walk list pages 1 and 2.
for page in range(1, 3):
    url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=" + str(timestamp) + "&pageIndex=" + str(page) + "&pageSize=10"
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection; the timeout keeps a
    # stalled server from hanging the script.
    with urllib.request.urlopen(req, timeout=10) as resp:
        data = resp.read().decode()

    # Pull every post id out of the list response.
    data1 = pattern.findall(data)

    # A separate loop variable is used so the page counter is not shadowed.
    for post_id in data1:
        myurl = "https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=" + str(timestamp) + "&postId=" + str(post_id) + "&language=zh-cn"
        req2 = urllib.request.Request(myurl, headers=headers)
        with urllib.request.urlopen(req2, timeout=10) as resp2:
            data3 = resp2.read().decode()

        # Extract the job title and responsibility text from the detail response.
        name = pattern2.findall(data3)
        text = pattern3.findall(data3)
        print(name)
        print(text)
        print("-------------------------------------------------")