# blackFlame  (stray scrape artifact — commented out: a bare name here raises NameError on import)
"""
    爬虫工具类
    author: 黑色的小火苗
"""
import re
from lxml import etree
import requests
from prettytable import PrettyTable


class SpiderUtils:
    """
    Utility helpers for simple novel-site scraping.

    Bundles session creation, GET/POST fetching, and regex/xpath-based
    extraction of novel listings and chapter text.
    """

    # Default request headers: a desktop Chrome User-Agent to get past
    # trivial UA-based anti-scraping checks.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/96.0.4664.110 Safari/537.36",
    }

    @staticmethod
    def get_session(headers=None, head_url=None):
        """
        Create a requests session with headers applied.

        :param headers: custom request headers; defaults to SpiderUtils.headers
        :param head_url: optional URL fetched once up-front (e.g. to pick up cookies)
        :return: a configured requests.Session
        """
        session = requests.session()
        # BUG FIX: a caller-supplied ``headers`` dict used to be silently
        # ignored (only the None branch assigned anything); install it now.
        session.headers = SpiderUtils.headers if headers is None else headers
        if head_url is not None:
            session.get(head_url)
        return session

    @staticmethod
    def get_text_or_content(url: str, url_type: str, request=None, data=None, encoding: str = "utf8",
                            get_type: str = "text"):
        """
        Fetch ``url`` and return the response body.

        :param url: request URL
        :param url_type: "get" or "post"
        :param request: an existing requests.Session to reuse, or None for a bare request
        :param data: form data for POST requests
        :param encoding: character encoding applied when returning text
        :param get_type: "text" for decoded text, anything else for raw bytes
        :return: response text (str) or content (bytes)
        :raises ValueError: if url_type is neither "get" nor "post"
        """
        if url_type == "get":
            if isinstance(request, requests.sessions.Session):
                res = request.get(url)
            else:
                res = requests.get(url, headers=SpiderUtils.headers)
        elif url_type == "post":
            if isinstance(request, requests.sessions.Session):
                res = request.post(url, data=data)
            else:
                res = requests.post(url, data=data, headers=SpiderUtils.headers)
        else:
            # BUG FIX: an unknown url_type previously left ``res`` as None and
            # crashed later with AttributeError; fail loudly at the source.
            raise ValueError(f"unsupported url_type: {url_type!r}")
        if get_type == "text":
            res.encoding = encoding
            return res.text
        return res.content

    @staticmethod
    def get_novel_table(text: str, get_novel_name: str, get_novel_author: str, get_novel_href: str, is_etree=False):
        """
        Extract (name, author, href) rows from a search-results page.

        :param text: response text
        :param get_novel_name: xpath or regex selecting novel names
        :param get_novel_author: xpath or regex selecting author names
        :param get_novel_href: xpath or regex selecting novel links
        :param is_etree: True to treat the selectors as xpath, False as regex
        :return: dict mapping row index -> [name, author, href]
        """
        if is_etree:
            tree = etree.HTML(text)
            names = tree.xpath(get_novel_name)
            authors = tree.xpath(get_novel_author)
            hrefs = tree.xpath(get_novel_href)
        else:
            flat = SpiderUtils.str_replace_none(text)
            names = re.findall(get_novel_name, flat)
            authors = re.findall(get_novel_author, flat)
            hrefs = re.findall(get_novel_href, flat)
        # zip guards against ragged extraction results: the original indexed
        # authors/hrefs by the name count and could raise IndexError.
        return {i: [name, author, href]
                for i, (name, author, href) in enumerate(zip(names, authors, hrefs))}

    @staticmethod
    def get_dict_novel_href(table_dict: dict, index: int, url_head: str = None):
        """
        Look up the request URL for a given row index.

        :param table_dict: e.g. {0: ['圣墟', '圣墟', '/2_2784/'], 1: ['圣墟(辰东)', '圣墟(辰东)', '/2_2786/']}
        :param index: row index whose URL is wanted
        :param url_head: optional prefix to prepend (the stored path may be relative)
        :return: the (possibly prefixed) URL string
        """
        url = table_dict.get(index)[2]
        if url_head is not None:
            return url_head + url
        return url

    @staticmethod
    def get_novel_chapter(text: str, get_chapter_name: str, get_chapter_info: str, is_etree=False):
        """
        Extract a chapter title and its content fragments from a chapter page.

        :param text: response text
        :param get_chapter_name: xpath or regex selecting the chapter title
        :param get_chapter_info: xpath or regex selecting the chapter body
        :param is_etree: True to treat the selectors as xpath, False as regex
        :return: [chapter_name, chapter_info_list]; the fragments usually still
                 need cleanup (see str_replace_garbled)
        :raises IndexError: on the regex path when the title pattern matches nothing
        """
        if is_etree:
            tree = etree.HTML(text)
            chapter_name = tree.xpath(get_chapter_name)
            chapter_info_list = tree.xpath(get_chapter_info)
        else:
            flat = SpiderUtils.str_replace_none(text)
            chapter_name = re.findall(get_chapter_name, flat)[0]
            chapter_info_list = re.findall(get_chapter_info, flat)
        return [chapter_name, chapter_info_list]

    @staticmethod
    def table_print(table_dict: dict):
        """
        Pretty-print a table dict produced by get_novel_table.

        :param table_dict: e.g. {0: ['圣墟', '圣墟', '/2_2784/'], 1: ['圣墟(辰东)', '圣墟(辰东)', '/2_2786/']}
        :return: None
        """
        # BUG FIX: the original line contained escaped quotes (\') outside a
        # string literal, which is a SyntaxError.
        table = PrettyTable(['序号', '书名', '作者名'])
        for index in range(len(table_dict)):
            row = table_dict.get(index)
            table.add_row([index, row[0], row[1]])
        print(table)

    @staticmethod
    def str_replace_none(text):
        """
        Strip carriage returns and newlines so regexes can match across lines.

        :param text: response text
        :return: the flattened string
        """
        return text.replace("\r", "").replace("\n", "")

    @staticmethod
    def str_replace_garbled(chapter_info_list, is_xpath_extract: bool):
        """
        Clean extracted chapter fragments of whitespace and markup noise.

        :param chapter_info_list: extracted fragments (list of strings for xpath,
                                  a single string at index 0 for regex)
        :param is_xpath_extract: True for xpath output, False for regex output
        :return: the cleaned chapter text as one string
        """
        if is_xpath_extract:
            # BUG FIX: the accumulator was initialised to None, so the first
            # ``+=`` raised TypeError; build the result with join instead.
            return "".join(
                fragment.replace("\n", "").replace(" ", "").replace("\u3000", "")
                for fragment in chapter_info_list
            )
        return chapter_info_list[0].replace("\u3000", "").replace("<br>", "").replace("<br/>", "")

# --- stray blog-footer text from the page this file was scraped from ---
# (commented out: bare "name:" lines are a SyntaxError in Python)
# 分类:      (Categories:)
# 技术点:    (Tech points:)
# 相关文章:  (Related articles:)