# roadzhao
# -*- coding: utf-8 -*-
"""
Created on Tue Dec  1 12:31:07 2020

@author: zhaolulu
"""
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from lxml import etree


# Request headers handed to requests.get by url_read: a desktop Chrome
# User-Agent string, split across lines only for readability.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.75 Safari/537.36"
    ),
}

def url_read(url, timeout=10):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``headers``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request itself fails; ``failed``
            is printed before the error is propagated.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Keep the console message, but re-raise the real error instead of
        # falling through to a return that reads an unbound variable.
        print('failed')
        raise
    return response.content.decode('utf-8')

def main():
    """Crawl the xbiquge novel site and save one novel's chapters to disk.

    Prints the links of the latest novels found on the home page, then
    walks the chapter list of one hard-coded test novel. For every chapter
    whose content XPath yields text, the text nodes are written with
    ``pandas.Series.to_csv`` to ``../book/<n>``, where ``n`` counts the
    chapters saved so far.
    """
    # Biquge novel site.
    base_url = 'http://www.xbiquge.la/'
    text = url_read(base_url)
    print("============================================")
    selector = etree.HTML(text)
    # URLs of the latest novels listed on the home page.
    ret = selector.xpath('//*[@id="newscontent"]/div[1]/ul/li/span[2]/a//@href')
    for note_url in ret:
        print(note_url)

    # One of those URLs is used as a fixed test case
    # (ret[0] was http://www.xbiquge.la/62/62585/ when this was written).
    n_text = url_read('http://www.xbiquge.la/62/62585/')
    n_html = etree.HTML(n_text)
    chapter_hrefs = n_html.xpath('//*[@id="list"]/dl/dd/a/@href')

    # Build the output location portably and make sure it exists, rather
    # than relying on a hand-written backslash path.
    out_dir = os.path.join('..', 'book')
    os.makedirs(out_dir, exist_ok=True)

    index = 0
    for t_url in chapter_hrefs:
        # Page holding the actual chapter text; urljoin copes with both
        # site-absolute and relative hrefs.
        f_url = urljoin(base_url, t_url)
        print(f_url)
        article = url_read(f_url)
        article_text = etree.HTML(article)
        article_detail = article_text.xpath('//*[@id="content"]/text()')
        # Only chapters that yielded text are saved and counted.
        if article_detail:
            pd.Series(article_detail).to_csv(os.path.join(out_dir, str(index)))
            index += 1


if __name__ == '__main__':
    main()

# Category:

# Technical points:

# Related articles: