XPath常用规则
/ 从当前节点选取直接子节点
// 从当前节点选取子孙节点
. 选取当前节点
.. 选取当前节点的父节点
@ 选取属性
* 通配符,匹配任意名称的元素节点
@* 选取所有属性
[@attrib] 选取具有给定属性的所有元素
[@attrib='value'] 选取给定属性具有给定值的所有元素
[tag] 选取所有具有名为 tag 的直接子元素的元素
[tag='text'] 选取所有具有名为 tag 的子元素、且该子元素的文本内容等于 text 的元素
![]()
"""爬取豆瓣网站的信息"""
import requests
from lxml import etree
# Request headers. The key must be spelled exactly "User-Agent"; under any
# other name requests still sends its own default User-Agent string and the
# browser-like value below is ignored.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
    "Referer": "https://movie.douban.com/",
}
url = "https://movie.douban.com/cinema/nowplaying/chongqing/"

# Fetch the page. The timeout keeps the script from hanging indefinitely on a
# stalled connection, and raise_for_status() reports an HTTP error here rather
# than as an unrelated IndexError when the expected markup is missing below.
rep = requests.get(url, headers=headers, timeout=10)
rep.raise_for_status()
text = rep.text

# Parse the response text into an lxml element tree.
html = etree.HTML(text)

# "//" searches all descendants; take the first <ul class="lists"> found.
ul = html.xpath("//ul[@class='lists']")[0]

# "./li" selects only the direct <li> children of that <ul>.
lis = ul.xpath("./li")

movies = []
for li in lis:
    # "@name" reads an attribute of the current node; xpath() returns a list,
    # so [0] takes the single attribute value.
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]
    region = li.xpath("@data-region")[0]
    actors = li.xpath("@data-actors")[0]
    director = li.xpath("@data-director")[0]
    # src of every <img> below this <li>; kept as a list (possibly empty).
    liimg = li.xpath(".//img/@src")
    movie = {
        "title": title,
        "score": score,
        "region": region,
        "actors": actors,
        "director": director,
        "liimg": liimg,
    }
    movies.append(movie)

print(movies)
View Code