person1-0-1
import requests
import os
from lxml import html
import time


def get_title_url(tree):
    """Level 1: scrape book links and titles from the site index page.

    The "史书典籍" (history classics) section is the 3rd index-li block.
    Returns (url_list, name_list), e.g. ['/book/sanguoyanyi.html'], ['三国演义'].
    """
    anchors = "//div[@class='index-li'][3]/ul/li/a"
    book_links = tree.xpath(anchors + "/@href")
    book_titles = tree.xpath(anchors + "/text()")
    return book_links, book_titles


def get_article_url(tree):
    """Level 2: scrape chapter links and titles from a book's contents page.

    Returns (url_list, name_list), e.g. ['/book/sanguoyanyi/1.html'],
    ['第一回·宴桃园豪杰三结义  斩黄巾英雄首立功'].
    """
    anchors = "//div[@class='book-mulu']/ul/li/a"
    chapter_links = tree.xpath(anchors + "/@href")
    chapter_titles = tree.xpath(anchors + "/text()")
    return chapter_links, chapter_titles


def get_article(tree):
    """Level 3: extract the chapter body and return it as a single string."""
    # Every paragraph of the chapter lives in <div class="chapter_content">.
    body_xpath = "//div[@class='chapter_content']/p/text()"
    return ''.join(tree.xpath(body_xpath))

def get_request(url, headers):
    """Fetch *url* and parse the HTML response into an lxml element tree.

    Args:
        url: absolute URL to fetch.
        headers: dict of HTTP headers (supplies the user-agent).

    Returns:
        lxml HTML element tree of the response body.
    """
    # BUG FIX: requests.get has no default timeout and can hang forever on
    # an unresponsive server; cap it so one dead request can't stall the crawl.
    response = requests.get(url=url, headers=headers, timeout=30)
    return html.fromstring(response.text)

def save_mkdir(two):
    """Ensure the output folder 史书典籍/<two> exists.

    Args:
        two: name of the per-book sub-folder (the book title).
    """
    # makedirs creates the parent folder in the same call, and exist_ok
    # replaces the racy check-then-create pattern of the original.
    os.makedirs('史书典籍/' + two, exist_ok=True)

def police_2(a):
    """Level-2 (book) resume checkpoint.

    Args:
        a: index of the book about to be crawled.

    Returns:
        False when book *a* was already passed on a previous run
        (a < saved index); otherwise records *a* as the current
        position and returns True.
    """
    path = '史书典籍/police_2.txt'
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            saved = f.read()
        # BUG FIX: the original used `b is ''`, an identity comparison on a
        # string literal — unreliable and a SyntaxWarning on modern Python.
        if saved.strip() and a < int(saved):
            return False
    # BUG FIX: make sure the parent folder exists before writing; the
    # original crashed on a fresh run because main() calls police_2
    # before save_mkdir has created 史书典籍.
    os.makedirs('史书典籍', exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(a))
    return True



def police_3(a):
    """Level-3 (chapter) resume checkpoint.

    Args:
        a: index of the chapter about to be downloaded.

    Returns:
        False when chapter *a* was already passed on a previous run
        (a < saved index); otherwise records *a* as the current
        position and returns True.
    """
    path = '史书典籍/police_3.txt'
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            saved = f.read()
        # BUG FIX: the original used `b is ''`, an identity comparison on a
        # string literal — unreliable and a SyntaxWarning on modern Python.
        if saved.strip() and a < int(saved):
            return False
    # BUG FIX: make sure the parent folder exists before writing; the
    # original crashed on a fresh run because main() calls the checkpoint
    # functions before save_mkdir has created 史书典籍.
    os.makedirs('史书典籍', exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(a))
    return True


def main():
    """Crawl www.shicimingju.com: save every chapter of every history
    classic to 史书典籍/<book>/<chapter>.txt, resuming after interruptions."""
    # Site root; all scraped hrefs are relative to it.
    root = 'http://www.shicimingju.com'
    # Pretend to be a desktop browser so the server serves normal pages.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }

    # Level 1: the index page lists all books.
    tree1 = get_request(root, headers)
    History_book_url_list, History_book_name_list = get_title_url(tree1)
    # Level 2: one page per book, listing its chapters.
    for i in range(len(History_book_url_list)):
        # Skip books finished on a previous, interrupted run.
        if police_2(i) is False:
            continue
        url2 = root + History_book_url_list[i]
        print("爬取>>>" + History_book_name_list[i] + '开始')
        tree2 = get_request(url2, headers)
        book_url_list, book_name_list = get_article_url(tree2)
        # Make sure 史书典籍/<book>/ exists before writing chapters.
        save_mkdir(History_book_name_list[i])
        # Level 3: download each chapter.
        for j in range(len(book_url_list)):
            # Skip chapters finished on a previous, interrupted run.
            if police_3(j) is False:
                continue
            time.sleep(1)  # be polite to the server
            url3 = root + book_url_list[j]
            print("爬取:" + book_name_list[j])
            tree3 = get_request(url3, headers)
            txt = get_article(tree3)
            txt_name = book_name_list[j]
            # Strip spaces and the '·' separator so the title is a safe filename.
            file_path = '史书典籍/{}/{}.txt'.format(History_book_name_list[i],(txt_name.replace(' ','')).replace('·',''))
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(txt)
        # BUG FIX: reset the chapter checkpoint once a book completes;
        # otherwise police_3 would skip the early chapters of the NEXT book
        # (its saved index carries over between books).
        if os.path.exists('史书典籍/police_3.txt'):
            os.remove('史书典籍/police_3.txt')
        print("爬取>>>" + History_book_name_list[i] + '结束')



# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

分类:

技术点:

相关文章: