liuweimingcprogram

爬取美团网数据

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests

from lib.re_util import ReUtil

# Meituan listing page that this script downloads.
base_url = 'http://ns.meituan.com/meishi/b25710/'

# Raw cookie string in "k1=v1; k2=v2" form.  Empty by default.
cookies_str = ''

# Parse the cookie string into the mapping form passed to ``requests``.
# Fragments without "=" (including the single empty fragment produced by an
# empty string or a trailing ";") are skipped; unpacking them used to raise
# ValueError.
cookies_dict = {}
for cookie in cookies_str.split(";"):
    name, sep, value = cookie.partition("=")
    if not sep:
        continue
    cookies_dict[name.strip()] = value.strip()

# Request headers; the User-Agent is a desktop Chromium browser string.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36'
}

# Download the listing page.  An explicit timeout is passed because
# ``requests`` applies none by default and would otherwise wait indefinitely
# on an unresponsive server.
page = requests.get(
    url=base_url,
    cookies=cookies_dict,
    headers=headers,
    timeout=10,
)

def get_element_from_html(raw_html):
    """Find the text between ``"poiInfos":`` and ``},"comHeader"`` in a page.

    Args:
        raw_html: Page source to search.

    Returns:
        The list returned by ``findall`` on the regex built by
        ``ReUtil.get_regex``: one tuple of captured groups per match, or an
        empty list when the markers are not found.  Index 1 of each tuple is
        the text captured directly after ``"poiInfos":``.
    """
    regex = ReUtil.get_regex(begin_with=['"poiInfos":'], end_with=['},"comHeader"'])
    result = regex.findall(raw_html)
    # Only print when something matched; indexing an empty result used to
    # raise IndexError.
    if result:
        print(result[0][1])
    return result

# Run the extraction on the downloaded page text; the return value is not used here.
get_element_from_html(page.text)

 

下面是 ReUtil 工具类的实现。这个工具其实也够用了,但还是建议使用 XPath 这类正规的方法来处理 HTML。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re


class ReUtil:
    """Helpers for building simple "begin ... middle ... end" regexes.

    Compiled patterns are cached in ``exits``, keyed by the generated pattern
    text, so repeated requests for the same delimiters reuse one object.
    """

    # Regex metacharacters that get a backslash prefix so they match literally.
    need_escape = {
        '\\': True,
        '^': True,
        '$': True,
        '.': True,
        '*': True,
        '+': True,
        '?': True,
        '{': True,
        '}': True,
        '(': True,
        ')': True,
        '[': True,
        ']': True,
        '|': True,
    }
    # Cache of compiled patterns: pattern string -> compiled regex.
    exits = {}

    @classmethod
    def get_regex(cls, begin_with=None, must_contain=None, end_with=None) -> 're.Pattern':
        """Return a compiled regex capturing text between literal delimiters.

        The pattern always has five capturing groups, in order: the begin
        delimiter, the text before ``must_contain``, the ``must_contain``
        literal, the text after it, and the end delimiter.  Delimiters are
        matched case-insensitively and ``.`` also matches newlines.

        Args:
            begin_with: A string or list of alternative strings that open the
                match.  ``None``/empty means "match the empty string".
            must_contain: Same form; literal required between the delimiters.
            end_with: Same form; literal that closes the match.

        Returns:
            The compiled pattern (shared between calls with equal arguments).
        """
        begin_with = cls.conver_to_list(begin_with)
        must_contain = cls.conver_to_list(must_contain)
        end_with = cls.conver_to_list(end_with)

        pattern = ''.join((
            cls.list_to_restring(begin_with),
            '(.*)?',
            cls.list_to_restring(must_contain),
            '(.*)?',
            cls.list_to_restring(end_with),
        ))

        regex_obj = cls.exits.get(pattern)
        if regex_obj is None:
            regex_obj = re.compile(pattern, re.DOTALL)
            cls.exits[pattern] = regex_obj
        return regex_obj

    @classmethod
    def list_to_restring(cls, args: list) -> str:
        """Turn literal strings into one case-insensitive alternation group.

        Each string is escaped using ``need_escape`` and the results are
        joined with ``|`` inside a single capturing group.

        The case-insensitive flag is written in its scoped form ``(?i:...)``.
        A bare ``(?i)`` in the middle of a pattern is rejected by ``re`` on
        Python 3.11+ ("global flags not at the start of the expression").
        The scoped group is non-capturing, so group numbering is unchanged.

        Args:
            args: List of literal strings; may be empty.

        Returns:
            The pattern fragment, e.g. ``((?i:a\\.b|c))`` for ``['a.b', 'c']``.
        """
        alternatives = (
            ''.join('\\' + ch if ch in cls.need_escape else ch for ch in arg)
            for arg in args
        )
        return '((?i:' + '|'.join(alternatives) + '))'

    @classmethod
    def conver_to_list(cls, value) -> list:
        """Normalise ``value`` to a list.

        Falsy values become ``[]``, a list is returned unchanged, and anything
        else is wrapped in a single-element list.
        """
        if not value:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @classmethod
    def get_all_number_to_list(cls, string):
        """Return every integer or decimal substring of ``string`` as text."""
        # Raw string: "\d" and "\." are invalid escapes in a normal literal.
        return re.findall(r'\d+\.?\d*', string)

 

分类:

技术点:

相关文章:

  • 2021-04-15
  • 2021-12-09
  • 2022-12-23
  • 2021-06-10
  • 2021-12-14
  • 2021-09-07
  • 2021-10-16
  • 2018-05-07
猜你喜欢
  • 2021-07-07
  • 2022-12-23
  • 2021-06-26
  • 2021-07-12
  • 2021-12-13
  • 2021-10-28
  • 2021-12-03
相关资源
相似解决方案