# lyqLovellx  (page residue; presumably the post author's handle)
import requests
from lxml import etree
import csv
# Request headers sent with every page fetch. The User-Agent mimics a desktop
# Chrome browser, presumably so the site serves the scraper like a regular visitor.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
}

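# Requirements: scrape the Douban Books Top 250 list.
# Fields to collect: title, URL, author, publisher, publication year, price,
# rating, number of ratings, and the one-line review.
# The scraped records are saved to a CSV file.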
def _first_text(nodes, default=''):
    """Return the first XPath result stripped of whitespace, or *default* if nothing matched."""
    return nodes[0].strip() if nodes else default


def _split_book_info(book_info):
    """Split a '/'-separated publication line into (author, press, year, price)."""
    parts = [part.strip() for part in book_info.split('/')]
    # The last three fields are read as press / year / price (same convention as
    # negative indexing); left-pad short lines so they yield blanks instead of
    # raising IndexError.
    if len(parts) < 3:
        parts = [''] * (3 - len(parts)) + parts
    *names, press, year, price = parts
    # Everything before the publisher (author, translator, ...) is merged into a
    # single text cell rather than being written as a Python list repr.
    return ' / '.join(names), press, year, price


def get_info(url, writer=None):
    """Fetch one listing page and write one CSV row per book found on it.

    Each row is (title, book URL, author, publisher, publication year, price,
    rating, number of ratings, one-line review).

    Args:
        url: Address of the listing page to download.
        writer: Object with a ``writerow`` method that receives each row.
            Defaults to the module-level ``wr`` created by the script entry point.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within 10 seconds.
    """
    out = wr if writer is None else writer

    # A timeout keeps a stalled connection from hanging the whole run, and the
    # status check stops an error page from being parsed as if it were a listing.
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()

    html = etree.HTML(res.text)
    for item in html.xpath('//tr[@class="item"]'):
        book_name = _first_text(item.xpath('./td/div/a/@title'))  # title
        # Separate name so the page URL parameter is not overwritten.
        book_url = _first_text(item.xpath('./td/div/a/@href'))  # detail-page URL
        book_info = _first_text(item.xpath('./td/p/text()'))
        score = _first_text(item.xpath('./td/div/span[2]/text()'))  # rating
        # The count is wrapped in parentheses with inner padding; peel both off.
        ev_number = _first_text(item.xpath('./td/div/span[3]/text()')).strip('()').strip()
        co_abstract = _first_text(item.xpath('./td/p/span/text()'))  # one-line review
        author, press, year, price = _split_book_info(book_info)

        out.writerow((book_name, book_url, author, press, year, price, score, ev_number, co_abstract))

if __name__ == '__main__':
    # `with` guarantees the CSV is flushed and closed even if a page fetch raises.
    with open('豆瓣读书.csv', 'wt', newline='', encoding='utf-8') as f:
        # `wr` must remain a module-level name: get_info() writes its rows through it.
        wr = csv.writer(f)
        wr.writerow(('书名', 'URL', '作者', '出版社', '出版年', '定价', '评分', '评价人数', '评价'))
        # The list is paged through the `start` offset: 0, 25, ..., 225 (ten pages).
        for num in range(0, 250, 25):
            get_info('https://book.douban.com/top250?start={}'.format(num))


# Results


# Category:

# Techniques:

# Related articles: