"""
爬虫工具类
author: 黑色的小火苗
"""
import re
from lxml import etree
import requests
from prettytable import PrettyTable
class SpiderUtils:
    """
    Scraping helpers for novel listing sites: session setup, page fetching,
    and extraction of book tables / chapter text via regex or XPath.
    """

    # Default request headers; desktop Chrome UA to pass naive anti-bot checks.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/96.0.4664.110 Safari/537.36",
    }

    @staticmethod
    def get_session(headers=None, head_url=None):
        """
        Create a requests session with headers applied.

        :param headers: custom headers dict; falls back to SpiderUtils.headers
        :param head_url: optional URL fetched once up-front (e.g. to pick up cookies)
        :return: a configured requests.Session
        """
        session = requests.session()
        # BUG FIX: a caller-supplied `headers` dict was previously ignored
        # (only the None case was handled); honor it now.
        session.headers = SpiderUtils.headers if headers is None else headers
        if head_url is not None:
            session.get(head_url)
        return session

    @staticmethod
    def get_text_or_content(url: str, url_type: str, request=None, data=None, encoding: str = "utf8",
                            get_type: str = "text"):
        """
        Perform a GET/POST request and return the decoded text or raw bytes.

        :param url: request URL
        :param url_type: "get" or "post"
        :param request: an existing requests.Session to reuse, or None for a one-off request
        :param data: form data for POST requests
        :param encoding: charset applied when get_type == "text"
        :param get_type: "text" returns response.text; anything else returns response.content
        :return: response text (str) or content (bytes)
        :raises ValueError: if url_type is neither "get" nor "post"
        """
        if url_type == "get":
            if isinstance(request, requests.sessions.Session):
                res = request.get(url)
            else:
                res = requests.get(url, headers=SpiderUtils.headers)
        elif url_type == "post":
            if isinstance(request, requests.sessions.Session):
                res = request.post(url, data=data)
            else:
                res = requests.post(url, data=data, headers=SpiderUtils.headers)
        else:
            # BUG FIX: previously fell through with res=None and crashed later
            # with an obscure AttributeError; fail fast with a clear message.
            raise ValueError(f"url_type must be 'get' or 'post', got {url_type!r}")
        if get_type == "text":
            res.encoding = encoding
            return res.text
        return res.content

    @staticmethod
    def get_novel_table(text: str, get_novel_name: str, get_novel_author: str, get_novel_href: str, is_etree=False):
        """
        Extract a numbered table of (name, author, href) triples from page text.

        :param text: the response body (HTML/text)
        :param get_novel_name: regex or XPath selecting the book names
        :param get_novel_author: regex or XPath selecting the author names
        :param get_novel_href: regex or XPath selecting the book links
        :param is_etree: True to treat the selectors as XPath, False for regex
        :return: dict mapping row index -> [name, author, href]
        """
        if is_etree:
            root = etree.HTML(text)
            names = root.xpath(get_novel_name)
            authors = root.xpath(get_novel_author)
            hrefs = root.xpath(get_novel_href)
        else:
            # Flatten newlines first so single-line regexes can match across tags.
            flat = SpiderUtils.str_replace_none(text)
            names = re.findall(get_novel_name, flat)
            authors = re.findall(get_novel_author, flat)
            hrefs = re.findall(get_novel_href, flat)
        # zip stops at the shortest list, so a selector that over/under-matches
        # cannot raise IndexError the way the old per-index access could.
        return {i: [name, author, href]
                for i, (name, author, href) in enumerate(zip(names, authors, hrefs))}

    @staticmethod
    def get_dict_novel_href(table_dict: dict, index: int, url_head: str = None):
        """
        Look up the request URL for one row of a novel table.

        :param table_dict: e.g. {0: ['圣墟', '辰东', '/2_2784/'], ...}
        :param index: row index to fetch
        :param url_head: optional base URL prepended when the stored href is relative
        :return: the (possibly absolute) URL string
        """
        url = table_dict.get(index)[2]
        return url if url_head is None else url_head + url

    @staticmethod
    def get_novel_chapter(text: str, get_chapter_name: str, get_chapter_info: str, is_etree=False):
        """
        Extract a chapter title and its body fragments from page text.

        :param text: the response body
        :param get_chapter_name: regex or XPath selecting the chapter title
        :param get_chapter_info: regex or XPath selecting the chapter body parts
        :param is_etree: True to treat the selectors as XPath, False for regex
        :return: [chapter_name, chapter_info_list]; body fragments usually need
                 further cleanup (see str_replace_garbled)
        """
        if is_etree:
            root = etree.HTML(text)
            chapter_name = root.xpath(get_chapter_name)
            chapter_info_list = root.xpath(get_chapter_info)
        else:
            flat = SpiderUtils.str_replace_none(text)
            chapter_name = re.findall(get_chapter_name, flat)[0]
            chapter_info_list = re.findall(get_chapter_info, flat)
        return [chapter_name, chapter_info_list]

    @staticmethod
    def table_print(table_dict: dict):
        """
        Pretty-print a novel table (index / title / author) to stdout.

        :param table_dict: e.g. {0: ['圣墟', '辰东', '/2_2784/'], ...}
        :return: None
        """
        # BUG FIX: the original line carried stray \' escape artifacts outside
        # the string literals, which is a syntax error.
        table = PrettyTable(['序号', '书名', '作者名'])
        # Keys are assumed to be the contiguous ints 0..n-1 produced by
        # get_novel_table.
        for index in range(len(table_dict)):
            row = table_dict.get(index)
            table.add_row([index, row[0], row[1]])
        print(table)

    @staticmethod
    def str_replace_none(text):
        """
        Strip carriage returns and newlines so multi-line HTML can be matched
        by single-line regexes.

        :param text: response text
        :return: the flattened string
        """
        return text.replace("\r", "").replace("\n", "")

    @staticmethod
    def str_replace_garbled(chapter_info_list, is_xpath_extract: bool):
        """
        Clean extracted chapter fragments (whitespace / leftover markup).

        :param chapter_info_list: extracted fragments (list of str)
        :param is_xpath_extract: True when fragments came from XPath, False for regex
        :return: the cleaned chapter text as one string
        """
        if is_xpath_extract:
            # BUG FIX: the accumulator used to start as None, so `None += str`
            # raised TypeError on every XPath call; join cleaned parts instead.
            return "".join(
                part.replace("\n", "").replace(" ", "").replace("\u3000", "")
                for part in chapter_info_list
            )
        return chapter_info_list[0].replace("\u3000", "").replace("<br>", "").replace("<br/>", "")
# NOTE(review): the line below was stray "related articles" footer text copied
# from a web page; it was a Python syntax error and has been commented out.
# 相关文章: