Erick-L
 1 # -*- coding:utf-8 -*-
 2 import requests
 3 import json
 4 import re
 5 import os
 6 import gevent
 7 import time
 8 import random
 9 from multiprocessing.dummy import Pool as ThreadPool
10 from bs4 import BeautifulSoup
class CommentCrawl(object):
    """Crawl the comments of a single Sina Weibo post.

    Given a post URL (e.g. http://weibo.com/<uid>/<base62-code>?...), the
    class decodes the base62 code into the post's numeric mid, pages through
    the AJAX comment endpoint, extracts comment texts and appends them to a
    local .txt file.
    """

    # NOTE(review): fill in a real User-Agent and a logged-in Cookie before
    # running — Weibo rejects anonymous AJAX requests.
    headers = {
        'User-Agent': '',
        'Cookie': ''}
    # Base62 alphabet used by Weibo URL codes: 0-9, a-z, A-Z (order matters).
    ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Kept for backward compatibility; shadowed by a per-instance list in
    # __init__ so separate crawler instances no longer share one list.
    all_comment = []

    def __init__(self, urlll, file_name):
        """
        :param urlll: full URL of the Weibo post to crawl
        :param file_name: output file name (without the .txt extension)
        """
        self.urlll = urlll
        self.file_name = file_name
        # Accumulates comments from every page fetched by this instance.
        self.all_comment = []

    def base62_decode(self, string, alphabet=ALPHABET):
        """Decode a base62 string into an integer using *alphabet*."""
        base = len(alphabet)
        num = 0
        for char in string:
            num = num * base + alphabet.index(char)
        return num

    def parser_url(self):
        """Build the AJAX comment-list URL template for this post.

        The base62 code from the post URL is split into groups ('X', 4, rest);
        each group decodes to a number and the numbers are concatenated into
        the post's mid. Every group after the first represents a 7-digit
        block, so it is zero-padded (the original code dropped leading zeros,
        yielding a wrong mid for some posts).

        :returns: URL template with a ``{}`` placeholder for the page number
        """
        code = self.urlll.split('?')[0].split('/')[-1]
        segments = [code[0], code[1:5], code[5:]]
        decoded = [self.base62_decode(seg) for seg in segments]
        mid = str(decoded[0]) + ''.join(str(n).zfill(7) for n in decoded[1:])
        return ('http://weibo.com/aj/v6/comment/big?ajwvr=6&id='
                + mid + '&root_comment_max_id_type=0&page={}')

    def get_url_page(self):
        """Fetch page 1 of the comment endpoint and return the total page count."""
        r = requests.get(self.parser_url().format(1), headers=self.headers)
        data = json.loads(r.text)
        return data['data']['page']['totalpage']

    def all_urls(self):
        """Return the concrete comment-page URLs for every page of the post."""
        template = self.parser_url()
        return [template.format(page + 1) for page in range(self.get_url_page())]

    def comment_parser(self, html):
        """Extract comment texts from one page of comment HTML.

        Each ``.WB_text`` node reads like ``username：comment``; only the part
        after the separator is kept.
        NOTE(review): the separator was garbled in the original source
        (``split('')`` raises ValueError); the full-width colon used by
        Weibo's markup is assumed here — confirm against live HTML.

        :param html: HTML fragment from the AJAX response's ``data.html``
        :returns: list of comment strings
        """
        soup = BeautifulSoup(html, 'html.parser')
        nodes = soup.select('.WB_text')
        return [node.text.split('：')[-1] for node in nodes]

    def finnal_text(self, url):
        """Fetch one comment page, accumulate its comments, and return them.

        The original returned a join of ``all_comment`` taken *before* the
        new page was appended, so each call wrote a stale cumulative snapshot
        and every comment was duplicated many times in the output file; now
        only the comments of this page are returned.

        :param url: one concrete comment-page URL
        :returns: the page's comments joined into a single string
        """
        r = requests.get(url, headers=self.headers)
        # Randomized pause to avoid hammering Weibo and tripping rate limits.
        time.sleep(random.randint(1, 5))
        data = json.loads(r.text)
        html = data['data']['html']
        comments = self.comment_parser(html)
        self.all_comment += comments
        print(len(self.all_comment))
        return ''.join(comments)

    def save_file(self, url):
        """Append the comments of one page to <file_name>.txt in the CWD."""
        path = os.path.join(os.getcwd(), self.file_name + '.txt')
        # 'with' guarantees the handle is closed (the original leaked it).
        with open(path, 'a+', encoding='utf-8') as f:
            f.write(self.finnal_text(url))
77 
if __name__ == "__main__":
    # Crawl every comment page of one post ('小米6发布会' = Xiaomi Mi 6 launch
    # event) with 4 worker threads; each worker fetches a page and appends
    # its comments to 小米6发布会.txt.
    crawler = CommentCrawl(
        'http://weibo.com/2202387347/EFdPHe50Z?from=page_1006062202387347_profile&wvr=6&mod=weibotime',
        '小米6发布会')
    links = crawler.all_urls()
    pool = ThreadPool(4)
    # map() blocks until every page has been processed; the per-call return
    # value (None from save_file) is deliberately discarded.
    pool.map(crawler.save_file, links)
    pool.close()
    pool.join()

 

分类:

技术点:

相关文章: