微信公众号爬虫--历史文章

今天搞了一个微信公众号历史文章爬虫的demo，亲测可行，记录一下！（不喜勿喷）

缺点：1.不是很智能 2. 兼容性不是很好，但是能应付正常情况啦

使用 MySQL + requests

数据库部分

直接建表ddl吧：

-- One row per crawled WeChat official-account article.
-- The COMMENT literals are column metadata stored by MySQL and are kept verbatim.
-- NOTE(review): `title` is varchar(225) while the other name columns use 255 --
-- looks like a typo, left unchanged; confirm before altering the schema.
-- `datetime` is a varchar: the crawler stores the publish time as a formatted
-- 'YYYY-MM-DD HH:MM:SS' string.
CREATE TABLE `wechat_content` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `wechat_name` varchar(255) DEFAULT NULL COMMENT '公众号名字',
  `title` varchar(225) DEFAULT NULL COMMENT '文章标题',
  `content_url` varchar(1000) DEFAULT NULL COMMENT '文章地址',
  `cover` varchar(1000) DEFAULT NULL COMMENT '封面图',
  `source_url` varchar(1000) DEFAULT NULL COMMENT '转载url',
  `source_name` varchar(255) DEFAULT NULL COMMENT '转载公众号名',
  `datetime` varchar(255) DEFAULT NULL COMMENT '发布时间',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1629 DEFAULT CHARSET=utf8

def get_sqlConn():
    """Open a new pymysql connection to the ``py_mysql_test`` database.

    ``ip`` and ``密码`` are placeholders for the MySQL host and the root
    password; they must be defined (or replaced with literals) before this
    function can succeed.

    Returns:
        The open connection, or None when connecting failed. The failure is
        printed, not raised, so callers must check for None.
    """
    try:
        conn = pymysql.connect(
            host=ip,  # e.g. 'localhost'
            port=3306,
            user='root',
            password=密码,
            db='py_mysql_test',
            charset='utf8',
        )
    except Exception as e:
        # `Exception` instead of a bare `except` so Ctrl-C / SystemExit still
        # propagate, and the reason for the failure is no longer discarded.
        print('error: %s' % e)
        return None
    print('数据库连接成功！')
    return conn

插入方法：

def insert_wechat_content(wechat_name,title,content_url,cover,source_url,source_name,datetime):
    """Insert one article row into ``wechat_content``.

    Each argument is stored in the column of the same name.

    The statement is executed with bound parameters instead of being built by
    string interpolation, so quotes inside a title or URL can no longer break
    or alter the SQL. Callers in this file pass every value already wrapped in
    single quotes (``'text'``); one such wrapping pair is stripped before
    binding, so those callers keep storing the same data. A raw value that
    genuinely starts and ends with a single quote would lose that pair.

    Failures are printed, not raised. Nothing is returned.
    """
    def _bare(value):
        # Remove one pair of wrapping single quotes added by the caller.
        if isinstance(value, str) and len(value) >= 2 and value[0] == value[-1] == "'":
            return value[1:-1]
        return value

    sql = (
        "INSERT INTO wechat_content"
        "(wechat_name,title,content_url,cover,source_url,source_name,datetime) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s)"
    )
    args = tuple(
        _bare(v)
        for v in (wechat_name, title, content_url, cover, source_url, source_name, datetime)
    )

    conn = get_sqlConn()
    if conn is None:
        # get_sqlConn reports its own failure and hands back None; without
        # this guard the cleanup code below would crash on the missing handle.
        print('插入发生数据错误！ERROR:%s' % 'no database connection')
        return

    cur = None
    try:
        cur = conn.cursor()
        print("微信公众号插入sql:%s %r" % (sql, args))
        cur.execute(sql, args)
        conn.commit()
        print('插入数据成功！')
    except Exception as e:
        print('插入发生数据错误！ERROR:%s' % e)
        conn.rollback()         # undo the partial write
    finally:
        if cur is not None:
            cur.close()
        conn.close()
    print('操作数据库完毕！')
为了防止有重复的，还有一个查询方法：

def select_wechat_content(title):
    """Check whether an article with this title is already stored.

    The title is bound as a query parameter instead of being spliced into the
    SQL text. Callers in this file pass the title already wrapped in single
    quotes (``'text'``); one such wrapping pair is stripped before binding.

    Returns:
        The single result row of the ``SELECT EXISTS(...)`` query: a
        one-column row whose first element is 1 when a matching title exists
        and 0 otherwise. Returns None when the query could not be run (the
        error is printed, not raised).
    """
    if isinstance(title, str) and len(title) >= 2 and title[0] == title[-1] == "'":
        title = title[1:-1]

    sql = "SELECT EXISTS(SELECT 1 FROM wechat_content WHERE title=%s)"

    conn = get_sqlConn()
    if conn is None:
        # get_sqlConn reports its own failure and hands back None.
        print('查询发生数据错误！ERROR:%s' % 'no database connection')
        return None

    cur = None
    try:
        cur = conn.cursor()
        print("微信公众号查询SQL：%s %r" % (sql, (title,)))
        cur.execute(sql, (title,))
        # EXISTS always yields exactly one row.
        return cur.fetchall()[0]
    except Exception as e:
        print('查询发生数据错误！ERROR:%s' % e)
        conn.rollback()
        return None
    finally:
        # Runs on the success path too; the early `return` used to skip the
        # close calls and leak the cursor and connection.
        if cur is not None:
            cur.close()
        conn.close()

Python 爬取部分

首先分析一下，通过charles抓包可以看到历史文章的请求

/mp/profile_ext?action=getmsg&__biz=MzU0NDQ2OTkzNw==&f=json&offset=17&count=10&is_ok=1&scene=&uin=MTU4MzgxNjcwNg%3D%3D&key=5a37b8e9f2933463aa4c791beaedc828c781ae48f9a58c2067595d03e2a4da3d43e47af1b87aea58849a45838a5cd1375e69afd980a0562d3327ff9a7227684fa872ad73ae54f8d9ae5b2392595e0a4d&pass_ticket=n9Zz%2F2GEUA9SBL%2FLVdK8uLAPMwNph3rMVVksmgD0xrMOstqSxkc%2BaMVRVnfNAC9M&wxtoken=&appmsg_token=1030_sVyKhffomeHucF5TrTgG3CyPO9kX-j3obN4DNg~~&x5=0&f=json

这是历史文章的请求接口，通过分析我们得知需要动态获取的参数有：

__biz : 用户和公众号之间的唯一id，
uin ：用户的私密id
key ：请求的秘钥，一段时间之后就会失效。
offset ：偏移量
count ：每次请求的条数
我的做法是直接拿到请求的url，然后解析url中的参数，得到请求的params参数，方法见下：
__biz 的值因为末尾有 ==（按 = 切分时会丢失），所以最后又重新赋值了

def get_parms(u):
    """Parse a captured query string into a parameter dict.

    :param u: the query part of the captured request, e.g.
        ``"action=getmsg&__biz=...==&offset=17"`` (no path, no ``?``)
    :return: dict mapping each parameter name to its value; when a name
        occurs more than once the last occurrence wins

    Each pair is split on its first ``=`` only, so values that themselves end
    in ``=`` (such as the base64 ``__biz``) are kept intact without patching
    them afterwards. Values are percent-decoded, so handing the dict to an
    HTTP client that encodes parameters does not double-encode ``%3D``-style
    escapes. Empty values (``scene=``) are preserved as ``""``.
    """
    # Function-scope import: the file's import block is not visible here.
    from urllib.parse import parse_qsl

    parms = dict(parse_qsl(u, keep_blank_values=True))
    print(parms)
    return parms
逻辑部分

def get_wx_article(u,wechat_name,index=0, count=10):
    """Fetch one page of an account's history articles and store the new ones.

    :param u: captured query string of the history request (everything after
        ``/mp/profile_ext?``)
    :param wechat_name: official-account name, saved with every row
    :param index: zero-based page number
    :param count: number of entries requested per page
    :return: True when the response reports more pages
        (``can_msg_continue == 1``); False otherwise, including when the
        response is not the expected JSON or ``errmsg`` is not ``'ok'``
    """
    # NOTE(review): index 0 maps to offset == count, so entries before that
    # offset are never requested here -- appears intentional; confirm.
    offset = (index + 1) * count
    url = "http://mp.weixin.qq.com/mp/profile_ext?"

    params = get_parms(u)
    params['offset'] = offset
    # Send the page size the offset was computed from; previously whatever
    # `count` happened to be in the captured URL was sent instead.
    params['count'] = count
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        # NOTE(review): hard-coded cookie taken from the capture -- presumably
        # has to be replaced for every new session.
        'Cookie': 'rewardsn=; wxtokenkey=777; wxuin=1583816706; devicetype=Windows10; version=62070141; lang=zh_CN; pass_ticket=n9Zz/2GEUA9SBL/LVdK8uLAPMwNph3rMVVksmgD0xrMOstqSxkc+aMVRVnfNAC9M; wap_sid2=CILAnPMFElw5Z0w3VXRGdjhNTlF4Ujd0YXFUSjM0MUpkSGFkcUdHTC0wa08tcUR3aEtWZElvcGRwTnUtUjllbHRTU3ctZ0JJQkR0RW1TZjgwNVZZd1RCaTMwNkZSd1lFQUFBfjCjgIDtBTgNQJVO'
    }

    response = requests.get(url=url, params=params, headers=headers)
    print(response.text)
    try:
        resp_json = response.json()
    except ValueError:
        # Body was not JSON (for example an HTML error page).
        print('获取文章异常...')
        return False
    if resp_json.get('errmsg') != 'ok':
        print('获取文章异常...')
        return False

    # Whether further pages exist; decides the return value.
    can_msg_continue = resp_json['can_msg_continue']
    # `general_msg_list` is itself a JSON document encoded as a string.
    general_msg_list = json.loads(resp_json['general_msg_list'])
    articles = general_msg_list.get('list') or []
    print(articles, "**************")

    # Values go to the DB helpers wrapped in single quotes, the form the
    # insert/select functions in this file take.
    quoted_name = "'{}'".format(wechat_name)
    for item in articles:
        print("=====>%s" % item)
        if 'app_msg_ext_info' not in item:
            # Some entries carry no app_msg_ext_info; skip them.
            continue
        info = item['app_msg_ext_info']

        title = "'{}'".format(info['title'])
        content_url = "'{}'".format(info['content_url'])
        cover = "'{}'".format(info['cover'])
        source_url = "'{}'".format(info['source_url'])
        # The `author` field is what gets stored in `source_name`.
        source_name = "'{}'".format(info['author'])

        # Publish time: epoch seconds rendered in local time.
        published = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(item['comm_msg_info']['datetime']),
        )
        published = "'{}'".format(published)

        print(title, content_url)
        print(source_url, source_name)
        print(cover, published)

        # select_wechat_content returns the EXISTS result row (or None on
        # failure), not a bare number; comparing the row itself to 1 was
        # never true, so duplicates were always re-inserted.
        exists_row = select_wechat_content(title)
        if exists_row and exists_row[0] == 1:
            print("数据已经存在")
        else:
            insert_wechat_content(quoted_name, title, content_url, cover, source_url, source_name, published)

    return can_msg_continue == 1

运行代码

if __name__ == '__main__':
    # Crawl page after page until get_wx_article reports there is nothing left.
    index = 0
    # Query string copied from the captured history request (without the
    # leading /mp/profile_ext?).
    u = "action=getmsg&__biz=MzU0NDQ2OTkzNw==&f=json&offset=17&count=10&is_ok=1&scene=&uin=MTU4MzgxNjcwNg%3D%3D&key=5a37b8e9f2933463aa4c791beaedc828c781ae48f9a58c2067595d03e2a4da3d43e47af1b87aea58849a45838a5cd1375e69afd980a0562d3327ff9a7227684fa872ad73ae54f8d9ae5b2392595e0a4d&pass_ticket=n9Zz%2F2GEUA9SBL%2FLVdK8uLAPMwNph3rMVVksmgD0xrMOstqSxkc%2BaMVRVnfNAC9M&wxtoken=&appmsg_token=1030_sVyKhffomeHucF5TrTgG3CyPO9kX-j3obN4DNg~~&x5=0&f=json"
    while True:
        print(f'开始抓取公众号第{index + 1} 页文章.')
        flag = get_wx_article(u, "Python学习开发", index=index)
        index += 1
        if not flag:
            print('公众号文章已全部抓取完毕，退出程序.')
            break

        # Pause 8 seconds between requests to avoid being blocked; done only
        # when another page follows, so the script no longer waits before exit.
        time.sleep(8)
        print(f'..........准备抓取公众号第{index + 1} 页文章.')

最终效果：

原文链接：https://www.cnblogs.com/cxiaolong/p/11318439.html

后续可以再把首页的文章添加进去，敬请期待