"""Scraper for the history-classics (史书典籍) section of shicimingju.com.

Walks three levels — site index -> book chapter list -> chapter text — and
saves each chapter as a UTF-8 text file under 史书典籍/<book name>/.  Progress
is checkpointed into two small files (police_2.txt for the book index,
police_3.txt for the chapter index) so an interrupted run can resume.
"""
import os
import time

import requests
from lxml import html


def get_title_url(tree):
    """Level 1: return (url_list, name_list) for the history-book section.

    URLs look like /book/sanguoyanyi.html, names like 三国演义.
    """
    history_book_url_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/@href")
    history_book_name_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/text()")
    return history_book_url_list, history_book_name_list


def get_article_url(tree):
    """Level 2: return (url_list, name_list) of the chapters of one book.

    URLs look like /book/sanguoyanyi/1.html, names like
    第一回·宴桃园豪杰三结义 斩黄巾英雄首立功.
    """
    book_url_list = tree.xpath("//div[@class='book-mulu']/ul/li/a/@href")
    book_name_list = tree.xpath("//div[@class='book-mulu']/ul/li/a/text()")
    return book_url_list, book_name_list


def get_article(tree):
    """Level 3: return the chapter body text joined into a single string."""
    article_list = tree.xpath("//div[@class='chapter_content']/p/text()")
    return ''.join(article_list)


def get_request(url, headers):
    """Fetch *url* with *headers* and return the parsed lxml HTML tree."""
    response = requests.get(url=url, headers=headers)
    return html.fromstring(response.text)


def save_mkdir(two):
    """Ensure the output directory 史书典籍/<two> exists (idempotent)."""
    # makedirs(exist_ok=True) creates both levels and replaces the old
    # exists()/mkdir() pattern, which was racy and more verbose.
    os.makedirs(os.path.join('史书典籍', two), exist_ok=True)


def _police(path, a):
    """Shared checkpoint logic for police_2/police_3.

    Returns False when index *a* is below the index stored in *path*
    (i.e. already handled by a previous, interrupted run); otherwise
    records *a* in *path* and returns True.

    Bug fixes vs. the original:
    - the original returned True *before* writing when the file was
      missing/empty, so the checkpoint file was never created and resume
      never worked; the write now happens on every True return;
    - ``b is ''`` (identity comparison with a literal) replaced by a
      proper emptiness check.
    """
    # The checkpoint lives under 史书典籍/, which may not exist yet on the
    # very first call (main() checks police_2 before save_mkdir runs).
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    stored = None
    if os.path.exists(path):
        with open(path, 'r') as f:
            stored = f.read()
    if stored not in (None, '') and a < int(stored):
        return False  # already done in a previous run
    # Record progress so the next run can resume here.
    with open(path, 'w') as f:
        f.write(str(a))
    return True


def police_2(a):
    """Level-2 (book index) resume check."""
    return _police('史书典籍/police_2.txt', a)


def police_3(a):
    """Level-3 (chapter index) resume check."""
    return _police('史书典籍/police_3.txt', a)


def main():
    """Crawl every history book and save each chapter to disk."""
    # Site root.
    root = 'http://www.shicimingju.com'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/76.0.3809.87 Safari/537.36'
    }
    # Level 1: list of books.
    tree1 = get_request(root, headers)
    history_book_url_list, history_book_name_list = get_title_url(tree1)
    for i, book_href in enumerate(history_book_url_list):
        if not police_2(i):
            continue  # this book was finished in a previous run
        book_name = history_book_name_list[i]
        print("爬取>>>" + book_name + '开始')
        # Level 2: list of chapters for this book.
        tree2 = get_request(root + book_href, headers)
        book_url_list, book_name_list = get_article_url(tree2)
        save_mkdir(book_name)
        for j, chapter_href in enumerate(book_url_list):
            if not police_3(j):
                continue  # this chapter was finished in a previous run
            time.sleep(1)  # be polite to the server
            print("爬取:" + book_name_list[j])
            # Level 3: chapter text.
            tree3 = get_request(root + chapter_href, headers)
            txt = get_article(tree3)
            # Strip spaces and '·' so the chapter title is filesystem-safe.
            txt_name = book_name_list[j].replace(' ', '').replace('·', '')
            file_path = '史书典籍/{}/{}.txt'.format(book_name, txt_name)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(txt)
        # Bug fix: reset the chapter checkpoint once a book completes so the
        # next book starts at its own first chapter instead of inheriting the
        # previous book's index and silently skipping chapters.
        with open('史书典籍/police_3.txt', 'w') as f:
            f.write('0')
        print("爬取>>>" + book_name + '结束')


if __name__ == '__main__':
    main()