爬取美团网数据
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Fetch one Meituan page and pull out the JSON text that follows "poiInfos":."""
import requests

from lib.re_util import ReUtil

base_url = 'http://ns.meituan.com/meishi/b25710/'

# Cookie header text in "k1=v1; k2=v2" form. Empty by default, in which case
# the request is sent without cookies.
cookies_str = ''
cookies_dict = {}
for cookie in cookies_str.split(";"):
    # ''.split(";") yields [''], and a segment without "=" cannot be unpacked
    # into (key, value); skip such segments instead of raising ValueError.
    if "=" not in cookie:
        continue
    k, v = cookie.split("=", 1)
    cookies_dict[k.strip()] = v.strip()

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36'
}


def get_element_from_html(raw_html):
    """Find the text between '"poiInfos":' and '},"comHeader"' in *raw_html*.

    Prints capture group 1 of the first match (the text right after the begin
    marker) and returns the full ``findall`` result: a list with one tuple of
    capture groups per match. Returns an empty list, printing nothing, when
    the markers are not present.
    """
    regex = ReUtil.get_regex(begin_with=['"poiInfos":'], end_with=['},"comHeader"'])
    result = regex.findall(raw_html)
    # Guard the index access: the page may not contain the markers at all
    # (e.g. an error page), in which case findall returns [].
    if result:
        print(result[0][1])
    return result


# The timeout keeps the script from hanging forever on an unresponsive server.
page = requests.get(
    url=base_url,
    cookies=cookies_dict,
    headers=headers,
    timeout=10,
)

get_element_from_html(page.text)
ReUtil 这个工具其实已经够用了,但还是建议使用 XPath 这类更规范的方法来处理 HTML。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Helpers for building regexes that capture text between literal markers."""
import re


class ReUtil:
    """Build, cache and apply simple marker-delimited regular expressions."""

    # Regex metacharacters that get a backslash prefix so they match literally.
    need_escape = {
        '\\': True,
        '^': True,
        '$': True,
        '.': True,
        '*': True,
        '+': True,
        '?': True,
        '{': True,
        '}': True,
        '(': True,
        ')': True,
        '[': True,
        ']': True,
        '|': True,
    }
    # Cache of compiled patterns, keyed by the pattern source string.
    exits = {}

    @classmethod
    def get_regex(cls, begin_with=None, must_contain=None, end_with=None) -> 're.Pattern':
        """Return a compiled (and cached) regex for the given literal markers.

        Each argument may be None/empty, a single string, or a list of
        alternative strings. The pattern has five capture groups, in order:
        begin marker, free text, must-contain marker, free text, end marker.
        Markers match case-insensitively and ``.`` also matches newlines
        (``re.DOTALL``).

        NOTE: the free-text groups are greedy, so the first one swallows as
        much as possible; callers rely on that group layout, so it is kept.
        """
        pattern = ''.join((
            cls.list_to_restring(cls.conver_to_list(begin_with)),
            '(.*)?',
            cls.list_to_restring(cls.conver_to_list(must_contain)),
            '(.*)?',
            cls.list_to_restring(cls.conver_to_list(end_with)),
        ))

        regex_obj = cls.exits.get(pattern)
        if regex_obj is None:
            regex_obj = re.compile(pattern, re.DOTALL)
            cls.exits[pattern] = regex_obj
        return regex_obj

    @classmethod
    def list_to_restring(cls, args: list) -> 'str':
        """Turn a list of literal strings into one case-insensitive capture group.

        Every character listed in ``need_escape`` is backslash-escaped and the
        alternatives are joined with ``|``. An empty list yields a group that
        matches the empty string.
        """
        escaped = [
            ''.join('\\' + ch if ch in cls.need_escape else ch for ch in arg)
            for arg in args
        ]
        # Use the scoped flag form (?i:...) rather than a bare (?i): a global
        # inline flag that is not at the very start of the pattern is an
        # error on Python 3.11+. The scoped group is non-capturing, so the
        # capture-group numbering is unchanged.
        return '((?i:' + '|'.join(escaped) + '))'

    @classmethod
    def conver_to_list(cls, value) -> 'list':
        """Normalize *value* to a list: falsy -> [], list -> itself, else [value]."""
        if not value:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @classmethod
    def get_all_number_to_list(cls, string):
        """Return every integer or decimal number found in *string*, as strings."""
        # Raw string: '\d' and '\.' are invalid escapes in a normal literal.
        return re.findall(r'\d+\.?\d*', string)