# -*- coding: utf-8 -*-
import csv
import os
import re
import time

import requests
from fake_useragent import UserAgent


class WeiBoSpider():
    def __init__(self, page):
        self.path = os.getcwd() + "/weibo.csv"
        self.csvfile = open(self.path, "a", newline="", encoding="utf-8-sig")
        self.writer = csv.writer(self.csvfile)
        # CSV header row
        self.writer.writerow(('topic URL', 'topic text', 'poster ID', 'poster nickname',
                              'poster gender', 'post date', 'post time', 'repost count',
                              'comment count', 'like count', 'commenter ID', 'commenter nickname',
                              'commenter gender', 'comment date', 'comment time', 'comment text'))
        self.headers = {
            'Cookie': '_T_WM=22822641575; H5_wentry=H5; backURL=https%3A%2F%2Fm.weibo.cn%2F; ALF=1584226439; MLOGIN=1; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5RJaVYrb.BEuOvUQ8Ca2OO5JpX5K-hUgL.FoqESh-7eKzpShM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMceoBfeh2EeKBN; SCF=AnRSOFp6QbWzfH1BqL4HB8my8eWNC5C33KhDq4Ko43RUIzs6rjJC49kIvz5_RcOJV2pVAQKvK2UbAd1Uh6j0pyo.; SUB=_2A25zQaQBDeRhGeBM71cR8SzNzzuIHXVQzcxJrDV6PUJbktAKLXD-kW1NRPYJXhsrLRnku_WvhsXi81eY0FM2oTtt; SUHB=0mxU9Kb_Ce6s6S; SSOLoginState=1581634641; WEIBOCN_FROM=1110106030; XSRF-TOKEN=dc7c27; M_WEIBOCN_PARAMS=oid%3D4471980021481431%26luicode%3D20000061%26lfid%3D4471980021481431%26uicode%3D20000061%26fid%3D4471980021481431',
            'Referer': 'https://m.weibo.cn/detail/4312409864846621',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.comments_ID = []
        self.page = page
    def get_title_id(self):
        """Collect status IDs from the trending-topic feed into self.comments_ID."""
        # range(1, self.page) scans feed pages 1 .. self.page - 1
        for page in range(1, self.page):
            # Use a throwaway random UA for the feed request so the cookie in
            # self.headers stays intact for the detail/comment requests later
            list_headers = {"User-Agent": UserAgent().chrome}
            time.sleep(1)
            api_url = 'https://m.weibo.cn/api/feed/trendtop?containerid=102803_ctg1_600059_-_ctg1_600059&page=' + str(page)
            rep = requests.get(url=api_url, headers=list_headers)
            # Pull every status ID out of the JSON feed
            for status in rep.json()['data']['statuses']:
                self.comments_ID.append(status['id'])
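
    # A minimal standalone check of get_title_id, assuming the trendtop API
    # still returns {"data": {"statuses": [{"id": ...}, ...]}} as the loop
    # above expects (an untested sketch, not part of the spider's flow):
    #
    #   spider = WeiBoSpider(page=2)   # scan a single feed page
    #   spider.get_title_id()
    #   print(spider.comments_ID[:5])  # first few status IDs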
    def spider_title(self, id):
        """Scrape the detail page of one epidemic topic and write its row to the CSV."""
        try:
            title_url = 'https://m.weibo.cn/detail/' + str(id)
            html_text = requests.get(url=title_url, headers=self.headers).text
            # Topic text
            title = re.findall('"text": "(.*?)",', html_text)[0]
            # Strip HTML tags from the topic text
            text = re.sub('<[^>]+>', '', title)
            # Poster's user ID
            user_id = re.findall('"id": "(.*?)",', html_text)[0]
            # Poster's nickname
            user_nicname = re.findall('"screen_name": "(.*?)",', html_text)[0]
            # Poster's gender
            user_gender = re.findall('"gender": "(.*?)",', html_text)[0]
            # Publication timestamp, e.g. "Sat Feb 15 10:30:45 +0800 2020"
            created_title_time = re.findall('"created_at": "(.*?)",', html_text)[0].split(" ")
            # Date (year/month/day)
            if 'Mar' in created_title_time:
                title_created_YMD = "{}/{}/{}".format(created_title_time[-1], '03', created_title_time[2])
            elif 'Feb' in created_title_time:
                title_created_YMD = "{}/{}/{}".format(created_title_time[-1], '02', created_title_time[2])
            elif 'Jan' in created_title_time:
                title_created_YMD = "{}/{}/{}".format(created_title_time[-1], '01', created_title_time[2])
            else:
                title_created_YMD = " "  # month outside Jan-Mar: leave the date blank
            # Publication time (HH:MM:SS)
            add_title_time = created_title_time[3]
            # Repost count
            reposts_count = re.findall('"reposts_count": (.*?),', html_text)[0]
            # Comment count
            comments_count = re.findall('"comments_count": (.*?),', html_text)[0]
            # Like count
            attitudes_count = re.findall('"attitudes_count": (.*?),', html_text)[0]
            comment_count = int(int(comments_count) / 20)  # each AJAX call loads 20 comments
            position1 = (title_url, text, user_id, user_nicname, user_gender, title_created_YMD,
                         add_title_time, reposts_count, comments_count, attitudes_count,
                         " ", " ", " ", " ", " ", " ")
            print(title_url, text, user_id, user_nicname, user_gender, title_created_YMD,
                  add_title_time, reposts_count, comments_count, attitudes_count)
            # Write the topic row
            self.writer.writerow(position1)
            return comment_count
        except Exception:
            return 0  # treat a failed detail page as having no comment pages
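
    # Sketch of an alternative to the month-by-month branching above, assuming
    # created_at follows the usual Weibo form "Sat Feb 15 10:30:45 +0800 2020"
    # (untested; shown only to document the timestamp layout being parsed):
    #
    #   from datetime import datetime
    #   dt = datetime.strptime("Sat Feb 15 10:30:45 +0800 2020",
    #                          "%a %b %d %H:%M:%S %z %Y")
    #   dt.strftime("%Y/%m/%d")  # -> "2020/02/15"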
    def get_page(self, id, max_id, id_type):
        """Fetch one page of comments for a status from the hotflow API."""
        params = {
            'max_id': max_id,
            'max_id_type': id_type
        }
        url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}'.format(id, id)
        try:
            r = requests.get(url, params=params, headers=self.headers)
            if r.status_code == 200:
                return r.json()
        except requests.ConnectionError as e:
            print('error', e.args)
    def parse_page(self, jsondata):
        """Extract the pagination cursor (max_id / max_id_type) for the next request."""
        if jsondata:
            items = jsondata.get('data')
            item_max_id = {}
            item_max_id['max_id'] = items['max_id']
            item_max_id['max_id_type'] = items['max_id_type']
            return item_max_id
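
    # How the cursor paging fits together (semantics inferred from the hotflow
    # responses this spider consumes, not from official documentation):
    #
    #   page 1: get_page(id, max_id=0, id_type=0)  -> ~20 comments + cursor
    #   page 2: get_page(id, cursor['max_id'],
    #                    cursor['max_id_type'])    -> next ~20 comments
    #
    # where cursor = parse_page(previous_json); a returned max_id of 0 appears
    # to mark the final page.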
    def write_csv(self, jsondata):
        """Write one page of comments to the CSV file."""
        for comment in jsondata['data']['data']:
            # Commenter's user ID
            user_id = comment['user']['id']
            # Commenter's nickname
            user_name = comment['user']['screen_name']
            # Commenter's gender: "m" means male, "f" means female
            user_gender = comment['user']['gender']
            # Comment text, with HTML tags stripped by regex
            comments_text = comment['text']
            comment_text = re.sub('<[^>]+>', '', comments_text)
            # Comment timestamp, e.g. "Sat Feb 15 10:30:45 +0800 2020"
            created_times = comment['created_at'].split(' ')
            if 'Feb' in created_times:
                created_YMD = "{}/{}/{}".format(created_times[-1], '02', created_times[2])
            elif 'Jan' in created_times:
                created_YMD = "{}/{}/{}".format(created_times[-1], '01', created_times[2])
            else:
                created_YMD = " "  # leave the date blank rather than crash below
                print('This timestamp falls outside the epidemic window; the data is probably off!')
            created_time = created_times[3]  # HH:MM:SS of the comment
            position2 = (" ", " ", " ", " ", " ", " ", " ", " ", " ", " ",
                         user_id, user_name, user_gender, created_YMD, created_time,
                         comment_text)
            # Write the comment row
            self.writer.writerow(position2)
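
    # Note on the CSV layout: spider_title fills the first ten columns (topic
    # fields) and pads the last six; write_csv pads the first ten and fills the
    # last six (comment fields), so topic rows and their comment rows share a
    # single 16-column schema.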
    def main(self):
        self.get_title_id()
        count_title = len(self.comments_ID)
        for count, comment_ID in enumerate(self.comments_ID):
            print("Scraping topic %s of %s" % (count + 1, count_title))
            # maxPage: how many 20-comment pages this topic has (0 on failure)
            maxPage = self.spider_title(comment_ID)
            m_id = 0
            id_type = 0
            if maxPage != 0:  # topics with fewer than 20 comments need no paging loop
                try:
                    # Drive the loop with the page count
                    for page in range(0, maxPage):
                        # Fetch one page of comments
                        jsondata = self.get_page(comment_ID, m_id, id_type)
                        # Write it to the CSV
                        self.write_csv(jsondata)
                        # Pull the cursor for the next page
                        results = self.parse_page(jsondata)
                        time.sleep(1)
                        m_id = results['max_id']
                        id_type = results['max_id_type']
                except Exception:
                    pass
            print("--------------------------separator---------------------------")
        self.csvfile.close()


if __name__ == '__main__':
    startTime = time.time()
    spider = WeiBoSpider(15)
    spider.main()
    endTime = time.time()
    useTime = (endTime - startTime) / 60
    print("This run took %s minutes in total" % useTime)