# 爬取豆瓣图书TOP250,爬取的数据存储到CSV文件中
from lxml import etree
import requests
import csv
# Destination of the generated CSV file and its column layout.
output_path = "D://pytext/douban_top250.csv"
csv_header = ("name", "url", "author", "publisher", "date", "price", "rate", "comment")
# One listing URL per page; the `start` offset advances in steps of 25
# (the third argument of range() is the step).
urls = ["https://book.douban.com/top250?start={}".format(i) for i in range(0, 250, 25)]
# Request headers used to imitate a desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}


def first_or_default(nodes, default=""):
    """Return the first item of the XPath result list *nodes*.

    Falls back to *default* when the list is empty, so a missing element in
    the page yields a placeholder instead of raising IndexError.
    """
    return nodes[0] if nodes else default


def split_book_info(book_info):
    """Split a "/"-separated book info line into its fields.

    The first field is taken as the author and the last three fields as
    publisher, publication date and price, in that order. Surrounding
    whitespace is stripped from every field. When the line has fewer than
    three fields the missing leading ones are returned as empty strings
    rather than raising IndexError.

    Returns:
        A tuple ``(author, publisher, date, price)`` of strings.
    """
    parts = [part.strip() for part in book_info.split("/")]
    author = parts[0]
    # Left-pad so that the "last three fields" always exist.
    padded = [""] * (3 - len(parts)) + parts
    publisher, date, price = padded[-3:]
    return (author, publisher, date, price)


def parse_item(info):
    """Build one CSV row from a ``<tr class="item">`` element.

    Args:
        info: an lxml element for a single book entry.

    Returns:
        A tuple ordered like ``csv_header``.
    """
    name = first_or_default(info.xpath("td/div/a/@title"))
    # Named `link` so it does not shadow the page URL of the outer loop.
    link = first_or_default(info.xpath("td/div/a/@href"))
    # Basic book information: author, publisher, publication date, price...
    book_info = first_or_default(info.xpath("td/p/text()"))
    author, publisher, date, price = split_book_info(book_info)
    # Rating of the book.
    rate = first_or_default(info.xpath("td/div/span[2]/text()"))
    # Short comment shown under the book; "空" is written when there is none.
    comment = first_or_default(info.xpath("td/p/span/text()"), "空")
    return (name, link, author, publisher, date, price, rate, comment)


def main():
    """Fetch every listing page and write all book rows to ``output_path``."""
    # newline="" lets the csv module control line endings (avoids blank rows
    # on Windows); the explicit encoding keeps Chinese text independent of
    # the platform default. The `with` block closes the file even on error.
    with open(output_path, "w", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        writer.writerow(csv_header)
        for page_url in urls:
            res = requests.get(page_url, headers=headers, timeout=10)
            # Fail loudly on an HTTP error instead of parsing an error page.
            res.raise_for_status()
            # Parse the HTML with lxml's etree.
            selector = etree.HTML(res.text)
            # Every book on the page lives in a <tr> with class "item".
            for info in selector.xpath("//tr[@class='item']"):
                # Rows are written as text; encoding happens once, in open().
                writer.writerow(parse_item(info))


if __name__ == "__main__":
    main()
相关文章:
- 爬取豆瓣图书top250 2021-05-11
- 豆瓣获取图书信息 2021-11-12
- python 爬取豆瓣图书 - Louiszj 2021-11-12
- 淘宝图书+豆瓣评论 2021-11-18
- 豆瓣图书 数据分析 2021-10-24
- requests+beautifulsoup爬取豆瓣图书 2021-07-17
- 微信小程序之小豆瓣图书 2021-04-12