# -*- coding: utf-8 -*-
"""
Created on Tue Dec 1 12:31:07 2020
@author: zhaolulu
"""
import pandas as pd
import requests
from lxml import etree
# Browser-like request headers so the target site serves normal pages
# instead of rejecting the scraper. (Adjacent string literals concatenate,
# so the User-Agent value is identical to a single long string.)
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.75 Safari/537.36"
    ),
}
def url_read(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns None (after printing 'failed') when the request cannot be
    completed or the server answers with an error status.  The original
    code used a bare ``except:`` that only printed and then fell through
    to an unbound ``reponse`` variable, raising UnboundLocalError on any
    network failure.
    """
    try:
        # timeout so a stalled connection cannot hang the crawl forever
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # treat 4xx/5xx as failures too
    except requests.RequestException:
        print('failed')
        return None
    # Site serves UTF-8; decode explicitly rather than trusting response.text
    # charset guessing.
    return response.content.decode('utf-8')
if __name__ == '__main__':
    # Home page of the Biquge novel site.
    url = 'http://www.xbiquge.la/'
    text = url_read(url)
    print("============================================")
    selector = etree.HTML(text)
    # Hrefs of the "latest novels" list on the home page.
    ret = selector.xpath('//*[@id="newscontent"]/div[1]/ul/li/span[2]/a//@href')
    for note_url in ret:
        print(note_url)

    # One novel chosen by hand as a test case.
    # print(ret[0])  # e.g. http://www.xbiquge.la/62/62585/
    n_text = url_read('http://www.xbiquge.la/62/62585/')
    n_html = etree.HTML(n_text)
    # Relative hrefs of every chapter of the novel.
    xpath_ret = n_html.xpath('//*[@id="list"]/dl/dd/a/@href')
    index = 0
    for t_url in xpath_ret:
        # Absolute URL of one chapter page.
        f_url = 'http://www.xbiquge.la' + t_url
        print(f_url)
        article = url_read(f_url)
        if article is None:
            # Skip chapters whose fetch failed instead of crashing on
            # etree.HTML(None).
            continue
        article_text = etree.HTML(article)
        article_detail = article_text.xpath('//*[@id="content"]/text()')
        if len(article_detail) > 0:
            # Original literal '..\book\' was a SyntaxError (the \' ended
            # the escape, not the string); forward slashes work on all
            # platforms.  Files are numbered 0, 1, 2, ...
            pd.Series(article_detail).to_csv('../book/' + str(index))
            # NOTE(review): indentation was lost in the source; the counter
            # is assumed to advance only for chapters actually saved.
            index = index + 1