# xpath之豆瓣图书案例 (XPath example: scraping the Douban Books Top 250)


# 爬取豆瓣图书TOP250,爬取的数据存储到CSV文件中
from lxml import etree
import requests
import csv
# Destination of the scraped data; open() creates the file but not the
# directory, so the directory must already exist.
OUTPUT_PATH = "D://pytext/douban_top250.csv"
# Column names written as the first CSV row.
CSV_HEADER = ('name', 'url', 'author', 'publisher', 'date', 'price', 'rate', 'comment')
# Placeholder stored in the "comment" column when a book has no one-line quote.
NO_COMMENT = "空"
# One URL per result page: 250 books, 25 per page (the third range() argument is the step).
urls = ["https://book.douban.com/top250?start={}".format(i) for i in range(0, 250, 25)]
# Request headers used to imitate a desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}


def first_or_default(values, default=""):
    """Return the first item of *values* with surrounding whitespace removed.

    XPath queries return a (possibly empty) list; indexing ``[0]`` directly
    raises IndexError when nothing matched, so *default* is returned instead.
    """
    return values[0].strip() if values else default


def parse_book_info(book_infos):
    """Split a '/'-separated book info line into its fields.

    The line looks like ``author / [translator /] publisher / date / price``.
    The author is the first field and publisher, date and price are the last
    three, so optional middle fields (e.g. a translator) are skipped.

    Returns:
        A tuple ``(author, publisher, date, price)`` of stripped strings.
        When the line has fewer than three fields, the missing leading
        ones among publisher/date/price are returned as empty strings
        rather than raising IndexError.
    """
    fields = [field.strip() for field in book_infos.split('/')]
    # Left-pad so that the three trailing fields always exist.
    padded = [""] * max(0, 3 - len(fields)) + fields
    publisher, date, price = padded[-3:]
    return fields[0], publisher, date, price


def fetch_books(page_url):
    """Download one Top 250 page and yield one CSV row tuple per book.

    Each yielded tuple follows the column order of CSV_HEADER.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
    """
    res = requests.get(page_url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    res.raise_for_status()
    # Parse the HTML with lxml's etree.
    selector = etree.HTML(res.text)
    # All data of one book lives under a <tr class="item"> element.
    for info in selector.xpath("//tr[@class='item']"):
        name = first_or_default(info.xpath('td/div/a/@title'))
        # Named book_url so it does not clobber the page URL being iterated.
        book_url = first_or_default(info.xpath('td/div/a/@href'))
        # Basic info line: author, publisher, publication date, price, ...
        book_infos = first_or_default(info.xpath('td/p/text()'))
        author, publisher, date, price = parse_book_info(book_infos)
        # Rating of the book.
        rate = first_or_default(info.xpath('td/div/span[2]/text()'))
        # One-line quote shown below the book; not every book has one.
        comment = first_or_default(info.xpath('td/p/span/text()'), NO_COMMENT)
        yield (name, book_url, author, publisher, date, price, rate, comment)


def main():
    """Scrape every Top 250 page and store all books in OUTPUT_PATH as CSV."""
    # newline="" is required by the csv module (avoids blank rows on Windows);
    # the explicit encoding lets the file object encode the text once, so the
    # fields are written as str rather than as b'...' reprs.
    # The with-block closes the file even if a request or parse step fails.
    with open(OUTPUT_PATH, "w", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        writer.writerow(CSV_HEADER)
        for page_url in urls:
            writer.writerows(fetch_books(page_url))


if __name__ == "__main__":
    main()