
Toutiao (今日头条) has become a fairly dominant player in the self-media space. In this post I'll walk you through scraping Toutiao's hot news with Python; in principle the approach can keep crawling indefinitely.

Open Toutiao in a browser and click 热点 (Hot) in the left-hand menu. In the developer tools' Network panel you will quickly find a request whose name starts with '?category=news_hot...'. Inspecting it shows that all of the news content is stored in its data field, and that the response is JSON, as in the screenshot below:

That makes things simple: once we have this request's URL, we can fetch the same data from Python with the requests library.
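Here is a minimal sketch of that idea. The as and cp values are placeholders copied from the sample URL shown below; how to generate fresh ones is covered later, and with stale values the endpoint may return nothing:

import requests

# Minimal sketch: fetch one page of the hot-news feed as JSON.
feed_url = 'https://www.toutiao.com/api/pc/feed/'
params = {
    'category': 'news_hot',
    'utm_source': 'toutiao',
    'widen': 1,
    'max_behot_time': 0,       # 0 on the first request
    'max_behot_time_tmp': 0,
    'tadrequire': 'true',
    'as': 'A1B5AC16548E0FA',   # placeholder; see get_as_cp() below
    'cp': '5C647E601F9AEE1',   # placeholder; see get_as_cp() below
}
resp = requests.get(feed_url, params=params, headers={'user-agent': 'Mozilla/5.0'})
print(resp.json()['data'][0]['title'])  # first headline, if the request was accepted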

Inspecting the request URL gives the following:

The link turns out to be: https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1B5AC16548E0FA&cp=5C647E601F9AEE1&_signature=F09fYAAASzBjiSc9oUU9MxdPX3

The URL carries 9 query parameters; reading them off the sample URL above:

  category=news_hot       -- the channel being requested (hot news)
  utm_source=toutiao      -- kept constant in this crawler
  widen=1                 -- kept constant in this crawler
  max_behot_time=0        -- 0 on the first request, then taken from the previous response
  max_behot_time_tmp=0    -- same value as max_behot_time
  tadrequire=true         -- kept constant in this crawler
  as=...                  -- computed from the current timestamp (algorithm below)
  cp=...                  -- computed from the current timestamp (algorithm below)
  _signature=...          -- can be dropped entirely (see the note before the full code)

The max_behot_time value is taken from the JSON returned by the previous request; see the screenshot below:
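For reference, the response has roughly this shape; the field names are the ones the crawler code below reads, and the values are purely illustrative:

demo = {
    'data': [
        {
            'title': '...',              # headline
            'source_url': '/group/...',  # article link, often relative
            'source': '...',             # name of the publishing Toutiao account
            'media_url': '/c/user/...',  # relative link to that account's page
        },
        # ...normally up to 10 items per request
    ],
    'next': {
        'max_behot_time': 1550000000,    # illustrative; feed this into the next request
    },
}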

 

Looking around online for analyses of the as and cp algorithm, I found that both parameters are generated in the JS file home_4abea46.js; the relevant code is:

!function(t) {
    var e = {};
    e.getHoney = function() {
        var t = Math.floor((new Date).getTime() / 1e3)
          , e = t.toString(16).toUpperCase()
          , i = md5(t).toString().toUpperCase();
        if (8 != e.length)
            return {
                as: "479BB4B7254C150",
                cp: "7E0AC8874BB0985"
            };
        for (var n = i.slice(0, 5), a = i.slice(-5), s = "", o = 0; 5 > o; o++)
            s += n[o] + e[o];
        for (var r = "", c = 0; 5 > c; c++)
            r += e[c + 3] + a[c];
        return {
            as: "A1" + s + e.slice(-3),
            cp: e.slice(0, 3) + r + "E1"
        }
    }
    ,
    t.ascp = e
}(window, document),

The Python code to compute the as and cp values is below (adapted from this blog: https://www.cnblogs.com/xuchunlin/p/7097391.html):

def get_as_cp():  # compute the as and cp parameters; follows the logic in Toutiao's obfuscated JS file home_4abea46.js
    zz = {}
    now = round(time.time())
    print(now)  # current Unix timestamp in seconds
    e = hex(int(now)).upper()[2:]  # hex() gives the timestamp as a hexadecimal string; strip the '0x' prefix
    print('e:', e)
    a = hashlib.md5()  # md5 hash object; hexdigest() below returns the digest as a hex string
    print('a:', a)
    a.update(str(int(now)).encode('utf-8'))
    i = a.hexdigest().upper()
    print('i:', i)
    if len(e) != 8:  # fall back to fixed values when the hex timestamp is not 8 characters, as the JS does
        zz = {'as': '479BB4B7254C150',
              'cp': '7E0AC8874BB0985'}
        return zz
    n = i[:5]   # first 5 characters of the digest
    a = i[-5:]  # last 5 characters of the digest
    r = ''
    s = ''
    for i in range(5):  # interleave the digest prefix with the hex timestamp
        s = s + n[i] + e[i]
    for j in range(5):  # interleave the hex timestamp (offset by 3) with the digest suffix
        r = r + e[j + 3] + a[j]
    zz = {
        'as': 'A1' + s + e[-3:],
        'cp': e[0:3] + r + 'E1'
    }
    print('zz:', zz)
    return zz
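A quick sanity check of the function; the exact values change every second with the timestamp:

ascp = get_as_cp()
print(ascp['as'], ascp['cp'])  # e.g. A1B5AC16548E0FA 5C647E601F9AEE1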

With that, the complete link can be constructed. One more note: the _signature parameter can simply be dropped and the JSON is still returned, so the request URL is done. The full code follows:

import requests
import json
from openpyxl import Workbook
import time
import hashlib
import os
import datetime

start_url = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='
url = 'https://www.toutiao.com'

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
cookies = {'tt_webid': '6649949084894053895'}  # copy this cookie from your own browser to reduce the chance of being blocked

max_behot_time = '0'  # URL parameter; 0 on the first request
title = []        # news titles
source_url = []   # news links as returned by the API
s_url = []        # full news links
source = []       # accounts that published the news
media_url = {}    # full links to those accounts


def get_as_cp():  # compute the as and cp parameters; follows the logic in Toutiao's obfuscated JS file home_4abea46.js
    zz = {}
    now = round(time.time())
    print(now)  # current Unix timestamp in seconds
    e = hex(int(now)).upper()[2:]  # hex() gives the timestamp as a hexadecimal string; strip the '0x' prefix
    print('e:', e)
    a = hashlib.md5()  # md5 hash object; hexdigest() below returns the digest as a hex string
    print('a:', a)
    a.update(str(int(now)).encode('utf-8'))
    i = a.hexdigest().upper()
    print('i:', i)
    if len(e) != 8:  # fall back to fixed values when the hex timestamp is not 8 characters, as the JS does
        zz = {'as': '479BB4B7254C150',
              'cp': '7E0AC8874BB0985'}
        return zz
    n = i[:5]   # first 5 characters of the digest
    a = i[-5:]  # last 5 characters of the digest
    r = ''
    s = ''
    for i in range(5):  # interleave the digest prefix with the hex timestamp
        s = s + n[i] + e[i]
    for j in range(5):  # interleave the hex timestamp (offset by 3) with the digest suffix
        r = r + e[j + 3] + a[j]
    zz = {
        'as': 'A1' + s + e[-3:],
        'cp': e[0:3] + r + 'E1'
    }
    print('zz:', zz)
    return zz


def getdata(url, headers, cookies):  # fetch a feed page and parse the JSON
    r = requests.get(url, headers=headers, cookies=cookies)
    print(url)
    data = json.loads(r.text)
    return data


def savedata(title, s_url, source, media_url):  # save the data to a local xlsx file
    wb = Workbook()
    if not os.path.isdir(os.getcwd() + '/result'):  # check whether the output folder exists
        os.makedirs(os.getcwd() + '/result')        # create it if not
    filename = os.getcwd() + '/result/result-' + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M') + '.xlsx'  # timestamped result file (%M = minutes)
    ws = wb.active
    ws.title = 'data'  # rename the worksheet
    ws['A1'] = 'title'         # header row
    ws['B1'] = 'news link'
    ws['C1'] = 'account'
    ws['D1'] = 'account link'
    for row in range(2, len(title) + 2):  # write the data row by row
        _ = ws.cell(column=1, row=row, value=title[row - 2])
        _ = ws.cell(column=2, row=row, value=s_url[row - 2])
        _ = ws.cell(column=3, row=row, value=source[row - 2])
        _ = ws.cell(column=4, row=row, value=media_url[source[row - 2]])

    wb.save(filename=filename)  # save the workbook



def main(max_behot_time, title, source_url, s_url, source, media_url):  # main routine
    for i in range(3):  # roughly the number of times you 'refresh'; each refresh normally yields 10 items but can yield fewer, so the final count is not necessarily a multiple of 10
        ascp = get_as_cp()  # get the as and cp parameters
        demo = getdata(start_url + max_behot_time + '&max_behot_time_tmp=' + max_behot_time + '&tadrequire=true&as=' + ascp['as'] + '&cp=' + ascp['cp'], headers, cookies)
        print(demo)
        # time.sleep(1)
        for j in range(len(demo['data'])):
            if demo['data'][j]['title'] not in title:
                title.append(demo['data'][j]['title'])            # news title
                source_url.append(demo['data'][j]['source_url'])  # news link
                source.append(demo['data'][j]['source'])          # publishing account
            if demo['data'][j]['source'] not in media_url:
                media_url[demo['data'][j]['source']] = url + demo['data'][j]['media_url']  # account link
        print(max_behot_time)
        max_behot_time = str(demo['next']['max_behot_time'])  # max_behot_time for the next request
    # build the full links once, after all pages are fetched, so s_url stays aligned with title
    for index in range(len(title)):
        print('Title:', title[index])
        if 'https' not in source_url[index]:  # relative links need the site prefix
            s_url.append(url + source_url[index])
            print('News link:', url + source_url[index])
        else:
            print('News link:', source_url[index])
            s_url.append(source_url[index])
        print('Account:', source[index])
    print(len(title))  # number of news items collected


if __name__ == '__main__':
    main(max_behot_time, title, source_url, s_url, source, media_url)
    savedata(title, s_url, source, media_url)
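A note on running it: with range(3) in main, the script pulls three pages of the feed, prints each headline with its full link, and writes everything to a timestamped workbook at result/result-YYYY-MM-DD-HH-MM.xlsx under the current working directory.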

About a hundred lines of code are enough to scrape Toutiao's hot news and store it locally, and the same approach works for the other channels. That wraps up this crawler. Next time I'll start from the accounts collected here and crawl their data: each account's follower count, plus the read counts and comment counts of its 10 most recent posts. Stay tuned...

Finally, screenshots of the program running and of the saved spreadsheet:

---------------------------------------------------------

Comments and discussion are welcome; let's improve together.