小说爬取
这里爬取的是笔趣阁小说圣墟
注释已经很详细
可以用 re,也可以用 xpath。
代码:
import requests
import re
from lxml import etree
class Novel:
    """Scrape the novel "圣墟" chapter by chapter from www.xbiquge.la
    and append each chapter's text to a local file."""

    # Browser-like User-Agent so the site does not reject the scraper as a bot.
    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }

    def __init__(self):
        # Kept for compatibility with the original implementation;
        # no method in this class ever reads it.
        self.switch = True

    def get_Chapter_url(self):
        """
        Fetch the table-of-contents page and scrape each chapter's URL.

        Only the first 10 chapters are downloaded (same limit as the
        original script); each one is handed to get_novel_content().
        """
        url = "http://www.xbiquge.la/13/13959/"
        html = requests.get(url, headers=self._HEADERS, timeout=10)
        # The site serves UTF-8; set it explicitly so .text decodes correctly.
        html.encoding = 'utf-8'
        # Parse the HTML into an element tree so we can query it with XPath.
        xml = etree.HTML(html.text)
        # Relative hrefs of every chapter link in the index list.
        link_list = xml.xpath('//*[@id="list"]/dl/dd/a/@href')
        for con in link_list[:10]:
            print(con)
            self.get_novel_content(con)

    def get_novel_content(self, con):
        """
        Download one chapter and append its text to the output file.

        :param con: relative chapter URL scraped from the index page,
                    e.g. "/13/13959/5939025.html".
        """
        full_url = "http://www.xbiquge.la" + con
        print(full_url)
        html = requests.get(full_url, headers=self._HEADERS, timeout=10)
        html.encoding = 'utf-8'
        xml = etree.HTML(html.text)
        # The chapter body is the plain text nodes inside the #content div.
        txt_list = xml.xpath('//*[@id="content"]/text()')
        for paragraph in txt_list:
            self.writePage(paragraph)
        print("保存成功!!")
        # Blank lines plus a separator between consecutive chapters.
        self.writePage("\n""\n""\n""\n""\n""----------------"+"下一章"+"--------------")

    def writePage(self, content):
        """
        Append *content* to the output text file.

        Append mode ("a") accumulates text across successive calls, so
        chapters end up concatenated in download order.

        :param content: text fragment to persist.
        """
        with open("小说——圣墟.txt", "a", encoding="utf8") as f:
            f.write(content)
if __name__ == "__main__":
    # Entry point: build the scraper and start crawling the chapter index.
    Novel().get_Chapter_url()