1. Packet capture
Visit a channel and capture the traffic with Charles to find the real request. Generally, the request whose JSON response matches the data shown in the page is the real one.
The request method is POST, so the request headers and form data have to be supplied. Because the requests go through Charles, Charles's proxy IP and certificate file also have to be configured. A test request then succeeds.
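A minimal sketch of replaying the captured request with requests; the URL, headers, form payload, proxy address, and certificate path below are placeholders, not the real captured values:

import requests

# Placeholder values -- capture the real ones with Charles.
url = 'https://mbd.baidu.com/searchbox?...'      # real endpoint from the capture
headers = {'User-Agent': '...'}                  # request headers copied from Charles
form = {'data': '...'}                           # POST form payload copied from Charles
proxies = {'https': 'http://127.0.0.1:8888'}     # Charles usually listens as an HTTP proxy on 8888; adjust to your setup

# verify points at the Charles root certificate exported as a .pem file
response = requests.post(url, data=form, headers=headers,
                         proxies=proxies, verify='charles.pem')
print(response.status_code)
print(response.json())                           # the real request returns JSON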
Capturing each channel separately with Charles shows that the request URL is always the same; only the tab_id field in the form changes from channel to channel. So a dict mapping tab_id to channel name is created, and different channels are reached by swapping in values from that dict, as sketched below.
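Switching channels then comes down to one regex substitution on the captured payload; a trimmed-down sketch of the idea (the full version is modify_tab_id in gailanye.py below):

import re

channel = {'1': '推荐', '3': '娱乐', '4': '体育'}   # excerpt of the tab_id -> channel map

form_data = '{"tab_id": "1"}'                      # placeholder for the captured payload

for tab_id, name in channel.items():
    # only the tab_id field changes; everything else in the payload stays the same
    form_data = re.sub(r'"tab_id": "\d+"', '"tab_id": "{}"'.format(tab_id), form_data)
    print(name, form_data)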
2. IP-ban test
Requests kept succeeding without ever being blocked, so just scrape away.
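The test itself was nothing more than a loop of repeated requests; a sketch, reusing the placeholder url/form/proxies from the snippet in step 1:

import time
import requests

url = 'https://mbd.baidu.com/searchbox?...'      # placeholders, as in step 1
form = {'data': '...'}
proxies = {'https': 'http://127.0.0.1:8888'}

for i in range(200):                             # keep hitting the endpoint
    response = requests.post(url, data=form, proxies=proxies, verify='charles.pem')
    print(i, response.status_code)               # an unbroken run of 200s means no ban yet
    time.sleep(0.5)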
3. Overview page
Normally an overview-page table only stores article links, but the Baidu App response already contains every field, so the entire response is stored in the database.
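Since the whole response is stored, the overview table only needs a channel column and a raw-data column. The real column list is elided in gailanye.py, so this schema is only a hypothetical layout:

import pymysql

# Hypothetical gly schema -- the actual column list is elided in the code below.
create_sql = (
    "CREATE TABLE IF NOT EXISTS gly ("
    "  id INT AUTO_INCREMENT PRIMARY KEY,"
    "  channel VARCHAR(32),"    # channel name, e.g. 推荐
    "  data MEDIUMTEXT"         # full JSON response exactly as the app returned it
    ") DEFAULT CHARSET=utf8"
)

con = pymysql.connect(host='127.0.0.1', db='bd', user='root', passwd='123456', charset='utf8')
with con.cursor() as cur:
    cur.execute(create_sql)
con.commit()
con.close()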
4. Detail page
Read the rows back out of the overview table, parse the individual fields out with regular expressions, drop the invalid parts, and store the fields in the database keyed by an MD5 hash of the article URL.
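The key is simply an MD5 digest of the article URL, used for de-duplication; a minimal sketch of exactly what xilanye.py does:

import hashlib

urlname = 'https://mbd.baidu.com/...'   # article URL parsed out of the overview data

m = hashlib.md5()
m.update(urlname.encode('utf8'))
hkey = m.hexdigest()                    # 32-character hex digest used as the row key
print(hkey)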
Note: set the Charles proxy IP yourself; the URL is only a template, capture your own; set up the database yourself; supply the certificate file yourself; capture the form data yourself; the parsing module extracts whatever fields you happen to need, so it is not provided here; the handling is not perfect yet, adjust it yourself.
gailanye.py
import requests
import re
import time
import pymysql


class BD(object):
    def __init__(self):
        self.url = 'https://mbd.baidu.com/searchbox?-omitted-7ig'
        self.form = {
            'data': '''omitted...

            '''
        }
        self.proxy = {
            'https': 'https://omitted'
        }
        # tab_id -> channel name, captured with Charles
        self.channel = {
            '1': '推荐',
            '3': '娱乐',
            '4': '体育',
            '5': '时尚',
            '6': '国际',
            '8': '热点',
            '12': '汽车',
            '13': '军事',
            '14': '科技',
            '15': '财经',
            '16': '游戏',
            '17': '女人',
            '18': '历史',
            '28': '搞笑',
            '35': '情感',
            '34': '美食',
            '41': '居家',
            '42': '政务',
            '43': '旅游',
            '44': '辟谣',
            '51': '健康',
            '54': '萌宠',
            '72': '新华社',
            '75': '虎扑',
            '81': '澎湃新闻',
            '85': '人民日报',
            '106': '36氪',
            '88': '虎嗅',
            '309999289': '上海',
            '309999257': '广州',
            '309999340': '深圳',
            '309999332': '天津',
            '309999179': '杭州',
            '309999315': '南京',
            '309999218': '武汉',
            '109999131': '北京',
        }

    def modify_tab_id(self, tab_id):
        # Rewrite the tab_id inside the form payload to switch channels
        self.form['data'] = re.sub(r'"tab_id": "(\d+)"', '"tab_id": "{}"'.format(tab_id), self.form['data'])
        # self.form['data'] = re.sub(r'"last_update_time": (\d+),', '"last_update_time": {}000,'.format(int(time.time())), self.form['data'])
        return self.form['data']

    def get_data(self):
        # Fetch every channel and collect [channel name, raw response] pairs
        list_d = []
        for tab_id in self.channel:
            print('=' * 20)
            print(tab_id)
            self.form['data'] = self.modify_tab_id(tab_id)
            response = requests.post(self.url, data=self.form, proxies=self.proxy, verify='*.pem')
            datas = response.text
            channel = self.channel[tab_id]
            data_channel = [channel, datas]
            print(data_channel)
            list_d.append(data_channel)
        return list_d

    def save_data(self, list_d):
        # Write the [channel, data] pairs into the gly table
        host = '127.0.0.1'
        db = 'bd'
        user = 'root'
        psd = '123456'
        charset = 'utf8'

        con = pymysql.connect(host=host, db=db, user=user, passwd=psd, charset=charset)
        cur = con.cursor()

        for i in list_d:
            print(i)
            sql = (
                "insert into gly(omitted)"
                "values(omitted)")
            list_m = [i[0], i[1]]  # i[0] is the channel name, i[1] is the data
            try:
                cur.execute(sql, list_m)
                print('insert success')
            except Exception as e:
                print('insert error', e)
                con.rollback()
            else:
                con.commit()
        cur.close()
        con.close()


if __name__ == '__main__':
    bd = BD()
    list_d = bd.get_data()
    bd.save_data(list_d)
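For reference, the stored JSON has roughly the shape below, which is the path xilanye.py walks; this skeleton is inferred from the parsing code, not an official schema:

import json

# Skeleton inferred from the parsing code in xilanye.py -- field values are made up.
sample = '''
{
  "data": {
    "100": {
      "itemlist": {
        "items": [
          {"data": {"mode": "text"}}
        ]
      }
    }
  }
}
'''

items = json.loads(sample)['data']['100']['itemlist']['items']
for text in items:
    print(text['data'].get('mode'))   # 'text' marks an ordinary article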
xilanye.py
import pymysql
import json
import time
import hashlib
import requests
from lxml import etree
import re


# The 娱乐 (entertainment) channel is dropped for now
# The 体育 (sports) channel has a navigation bar whose data cannot be fetched yet, so it is filtered out


class XLY(object):
    def __init__(self):
        self.no_results_channel = []  # channels that returned no data
        self.proxy = {
            'https': '....'
        }
        self.sum_data = 0

    def get_data(self):
        # Read the [channel, data] rows back out of the gly table
        host = '127.0.0.1'
        db = 'bd'
        user = 'root'
        pwd = '123456'
        charset = 'utf8'
        con = pymysql.connect(host=host, db=db, user=user, passwd=pwd, charset=charset)

        datas = []
        cur = con.cursor()
        sql = 'select * from gly'
        try:
            cur.execute(sql)
            results = cur.fetchall()
            i = 0
            for result in results:
                i += 1
                # Rows are returned as tuples; convert to lists
                result = list(result)
                if '{"100":[]}' in result[1]:
                    self.no_results_channel.append(result[0])
                    print('no results channel:', result[0])
                elif 'navigatorItems' in result[1]:
                    print('channel with a navigation bar, not handled yet')
                else:
                    data = [result[0], result[1]]
                    datas.append(data)
                    print('get_data')
                print('=' * 20, i)
                # if i == 5:
                #     break
        except Exception as e:
            print('error', e)
            con.rollback()
        else:
            con.commit()
        return datas

    def parse_data(self, datas):
        items = []
        for data in datas:
            channel = data[0]
            channel_data = data[1]
            channel_data = json.loads(channel_data)
            channel_data = channel_data['data']['100']['itemlist']['items']

            for text in channel_data:
                print('=' * 20)
                item = {}
                try:
                    mode = text['data']['mode']
                except:
                    mode = ''
                    print('mode not found')
                # Use mode to keep only articles and filter out galleries and ads
                if mode == 'text':
                    # omitted: the other fields (including item['urlname']) are extracted here

                    m1 = hashlib.md5()
                    m1.update(item['urlname'].encode("utf8"))
                    item['hkey'] = m1.hexdigest()

                    try:
                        item['comments'] = text['data']['comment_num'][:-2]
                    except:
                        item['comments'] = ''
                        print('no comment_num')

                    # Fetch content and url_time from the article page
                    # (note: they are not yet added to item here)
                    content, url_time = self.parse_content(item['urlname'])

                    print(item)
                    self.save_data(item)
                    if item != {}:
                        items.append(item)
        return items

    def parse_content(self, url):
        # Fetch a single article and pull out its content and url_time
        response = requests.get(url, proxies=self.proxy, verify='omitted.pem')
        text = response.text
        element = etree.HTML(text)
        contents = element.xpath('//p[@class="contentText contentSize contentPadding"]//text()')
        url_time = element.xpath('//div[@class="infoSet"]//text()')
        try:
            # url_time is a list of text nodes; the date string is the second one
            if '17-' in url_time[1]:
                url_time = re.sub('17', '2018', url_time[1])
                print(url_time)
            else:
                url_time = '2018-' + str(url_time[1])
        except:
            url_time = ''
        if not contents:
            contents = ''
        else:
            contents = ''.join(contents)
        return contents, url_time

    def save_data(self, item):
        host = '127.0.0.1'
        db = 'bd'
        user = 'root'
        pwd = '123456'
        charset = 'utf8'

        con = pymysql.connect(host=host, db=db, user=user, passwd=pwd, charset=charset)
        cur = con.cursor()
        sql = 'insert into xly(omitted)' \
              'values(omitted)'
        values = [omitted]
        try:
            cur.execute(sql, values)
            print('insert success')
            self.sum_data += 1
            print('inserted row {} into the database'.format(self.sum_data))
        except Exception as e:
            print('error~~', e)
            con.rollback()
        else:
            con.commit()
        # cur.execute(sql, values)
        cur.close()
        con.close()


if __name__ == '__main__':
    xly = XLY()
    datas = xly.get_data()
    items = xly.parse_data(datas)
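As with gly, the xly column list is elided; a hypothetical layout covering only the fields the parsing code visibly produces:

# Hypothetical xly schema -- only covers fields visible in the parsing code;
# the real column list is elided.
create_sql = (
    "CREATE TABLE IF NOT EXISTS xly ("
    "  hkey CHAR(32) PRIMARY KEY,"    # MD5 of the article URL, for de-duplication
    "  urlname VARCHAR(512),"         # article URL
    "  comments VARCHAR(16),"         # comment count with the suffix stripped
    "  content MEDIUMTEXT,"           # article body joined from the XPath text nodes
    "  url_time VARCHAR(32)"          # publish time normalized to a 2018-... string
    ") DEFAULT CHARSET=utf8"
)

Run gailanye.py first to fill gly, then xilanye.py to parse it into xly.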