# 爬虫爬小说 - 爱码网 (novel-scraping crawler; title of the article this script was taken from)

import requests as r
import re,encodings
import time
from lxml import etree
def pa(url: str, name: str) -> None:
    """Download one chapter page and append its title and text to a file.

    The page is fetched with a browser-like User-Agent, decoded as UTF-8 and
    parsed with lxml. The chapter title is taken from the first ``h1`` found
    at a fixed XPath and the body from the text nodes directly under the
    element with ``id="content"``.

    Args:
        url: Absolute URL of the chapter page.
        name: Path of the text file to append to (opened in ``a+`` mode, so
            it is created when missing and never truncated).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If no chapter title is found at the expected XPath.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    z = r.get(url, headers=headers, timeout=30)
    # Fail on 4xx/5xx instead of trying to parse an error page as a chapter.
    z.raise_for_status()
    z.encoding = 'UTF-8'
    html = etree.HTML(z.text)

    # Look up the chapter title.
    titles = html.xpath('//*[@id="wrapper"]/div[3]/div/div[2]/h1/text()')
    if not titles:
        # Same exception type the bare ``[0]`` lookup raised, but with a
        # message that names the page that could not be parsed.
        raise IndexError("chapter title not found in page: %s" % url)
    zhangjie = titles[0]

    print(zhangjie)

    # Collect the chapter text with XPath, one text node per line.
    content = '\n'.join(html.xpath('//*[@id="content"]/text()'))

    with open(name, 'a+', encoding="UTF-8") as txt:
        txt.write(zhangjie + "\n")
        # End the body with a newline so the next chapter's title does not
        # get glued onto the last line of this chapter.
        txt.write(content + "\n")
        print(zhangjie + ":\t写入成功")



if __name__ == '__main__':
    # Table-of-contents page of the novel to download.
    mulu_url = 'http://www.yuetutu.com/cbook_22694/'
    # Send the browser-like User-Agent with the request; it was previously a
    # bare string statement that had no effect.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    s = r.get(mulu_url, headers=headers, timeout=30)
    s.raise_for_status()
    s.encoding = 'utf-8'
    text = s.text
    html = etree.HTML(text)

    # The first <h1> on the page is used as the output file name.
    title_match = re.search(r'<h1>(.*?)</h1>', text)
    if title_match is None:
        raise SystemExit("no <h1> title found on %s" % mulu_url)
    name = "./%s.txt" % title_match.group(1)

    # Chapter links, in page order.
    mulu = html.xpath('//*[@id="list"]/dl/dd/a/@href')

    print(name)
    print(mulu)

    # The first 8 links are skipped, as before.
    # NOTE(review): presumably they are a "latest chapters" box that repeats
    # entries from the full list - confirm against the page.
    for i in mulu[8:]:
        pa('http://www.yuetutu.com' + i, name)
        # Pause between chapter downloads to limit the request rate.
        time.sleep(1)