# 爬虫实践小例子 — a small web-scraping practice example
"""Download every book listed on shicimingju.com, one text file per book.

For each book on the index page, every chapter's text is fetched and
appended (as ``title:content``) to ``./books/<bookname>``.
"""
import requests, os
from urllib import request
from lxml import etree

# Directory where each book's text file will be written.
dirName = './books'
os.makedirs(dirName, exist_ok=True)  # idempotent; no race between check and create

# Present a regular browser UA so the site does not reject the requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# Fetch the book index page and collect the link to each book.
url = 'http://www.shicimingju.com/book'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
a_list = tree.xpath('//div[@class="bookmark-list"]//a')

for a in a_list:
    # xpath() returns a list (e.g. ['三国演义'], ['/book/sanguoyanyi.html']);
    # take the first element to get the bare string.
    bookname = a.xpath('./text()')[0]
    book_path = 'http://www.shicimingju.com' + a.xpath('./@href')[0]

    # Fetch the book's table of contents and collect its chapter links.
    book_page = requests.get(book_path, headers=headers).text
    book_tree = etree.HTML(book_page)
    book_a_list = book_tree.xpath('//div[@class="book-mulu"]//a')

    path = dirName + '/' + bookname
    with open(path, 'w', encoding='utf-8') as f:
        # Distinct loop variable so the outer `a` is not shadowed.
        for chapter_a in book_a_list:
            title = chapter_a.xpath('./text()')[0]
            detail_path = 'http://www.shicimingju.com' + chapter_a.xpath('./@href')[0]
            detail_page = requests.get(detail_path, headers=headers).text
            # The chapter body is split across many text nodes; join them.
            content = etree.HTML(detail_page).xpath('//div[@class="chapter_content"]//text()')
            content = ''.join(content)
            f.write(title + ':' + content + '\n')
            print(title, "下载成功")