一、配置
首先对fiddler和模拟器进行配置,下篇具体讲述
二、测试、分析
点击搜索后 fiddler上就有了个post请求
而用户的内容数量已经在返回的json数据中给出:aweme_count=103,即共有103个内容。
现在进入主页
aweme_list 下每一个{}就是一个短视频 使用鼠标一直下拉会有多个json链接出现
我们的视频链接在:aweme_list中,每个视频下的video下的play_addr下的url_list中,会有6个url链接,都是同样的视频,个人理解这应该是应对各个场合的需求。
刚进入主页的同时 只有最初的20个视频,我们可以向下滑动加载,也可以自动化来向下模拟滑动,就会不断的出现如上图加载的json数据包
将json数据包保存到本地,fiddler提供了一个自带的脚本,在里面添加规则,当视频json包刷出来后自动保存json包
方法一:
菜单栏找到 Rules => Customize
找到 OnBeforeResponse
在下面添加规则:
filename 是本地保存路径
strBody 是获取到的json数据包,
-
# 抖音会经常变动这个url,所以需要经常改 -
oSession.uriContains("https://aweme-hl.snssdk.com/aweme/v1/aweme/post/")
注意: 有一点要注意的就是 有一个json数据包 写入一个,有多个就写入多个,这多个json数据写入进去后 每个包和每个包中间没有任何内容 就像是拼接成的字符串,在提取的时候会报错。我用了如下方法解决的 此方法有点low 各位大佬有好的方法可以不吝赐教。
至于我所添加的比较low的"aaaaa" ,主要作用是 便于正则匹配。
-
static function OnBeforeResponse(oSession: Session) { -
if (m_Hide304s && oSession.responseCode == 304) { -
oSession["ui-hide"] = "true"; -
} -
if (oSession.uriContains("https://aweme-hl.snssdk.com/aweme/v1/aweme/post/")){ -
var strBody=oSession.GetResponseBodyAsString(); -
var sps = oSession.PathAndQuery.slice(-58,); -
//FiddlerObject.alert(sps) -
var filename = "C:/Users/Administrator/Desktop/抖音爬虫json/raw_data" + "/" + sps + ".json"; -
var curDate = new Date(); -
var sw : System.IO.StreamWriter; -
if (System.IO.File.Exists(filename)){ -
sw = System.IO.File.AppendText(filename); -
sw.Write("aaaaa"+ strBody + "aaaaa"); -
} -
else{ -
sw = System.IO.File.CreateText(filename); -
sw.Write('aaaaa' + strBody + 'aaaaa'); -
} -
sw.Close(); -
sw.Dispose(); -
} -
}
比如:json数据为
-
import re

# Example: the file content after the first packet has been written.
j1 = '{"li": {"zzz":"111","xxx":"222","ccc":"333"}}'
# The file content after a second packet has been appended.
j2 = '{"li": {"zzz":"111","xxx":"222","ccc":"333"}}{"li":{"zzz":"111","xxx":"222","ccc":"333"}}'
# j2 is two JSON documents glued together, so it cannot be parsed as JSON and
# the individual packets cannot be iterated over.
# Therefore every packet is wrapped in marker characters when it is written.
# The marker itself is arbitrary; it only has to be easy to match.
j3 = 'aaaaa{"li": {"zzz":"111","xxx":"222","ccc":"333"}}aaaaaaaaaa{"li":{"zzz":"111","xxx":"222","ccc":"333"}}aaaaa'
# Regex: capture everything between two markers, ending on a closing brace.
# The markers are plain text in the file, so the pattern has no quote characters.
p = r'aaaaa(.*?})aaaaa'
ss = re.findall(p, j3)
# Output: ['{"li": {"zzz":"111","xxx":"222","ccc":"333"}}', '{"li":{"zzz":"111","xxx":"222","ccc":"333"}}']
# Each element is now a standalone JSON document that can be parsed in a loop.
方法二:
打开它,同样把规则写进去就OK了
而后你再重新进入主页,你所保存的json数据包就出现了。
三、撸代码
-
import os, json, requests, re, random -
# Pool of browser User-Agent strings used to disguise the download requests.
# Every entry is unique so that random.choice picks each one with equal odds.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
]
# One User-Agent is picked at start-up and reused for every request of this run.
headers = {'User-Agent': random.choice(USER_AGENTS)}
# Folder where the Fiddler rule stores the captured JSON packets.
RAW_DIR = 'C:/Users/Administrator/Desktop/抖音爬虫json/raw_data/'
# Root folder for the downloaded videos (one sub-folder per author).
VIDEO_DIR = 'C:/Users/Administrator/Desktop/抖音爬虫json/VIDEO/'
# Every packet was written as "aaaaa" + body + "aaaaa"; capture the body.
# DOTALL keeps the match working if a body ever contains a line break.
PACKET_RE = re.compile(r'aaaaa(.*?})aaaaa', re.S)


def _safe_name(name):
    """Return name with characters that Windows forbids in paths replaced."""
    return re.sub(r'[\\/:*?"<>|\r\n]', '_', name).strip(' .') or 'unknown'


def _fetch(url):
    """Download url with the module-level headers; return bytes, or None on failure."""
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print('视频下载失败-{}'.format(url), exc)
        return None
    return resp.content


def download_all(raw_dir=RAW_DIR, video_dir=VIDEO_DIR, fetch=_fetch):
    """Download every video referenced by the saved JSON packets.

    raw_dir   -- folder holding the packet files written by the Fiddler rule.
    video_dir -- output root; videos go to <video_dir>/<author>/<author>_<n>.mp4.
    fetch     -- callable taking a URL and returning the video bytes, or None
                 to skip that video.

    Returns the number of videos written.
    """
    count = 0
    # Running index per author, kept across all packet files so that videos
    # from a later file never overwrite those saved from an earlier one.
    per_author = {}
    for packet_file in sorted(os.listdir(raw_dir)):
        path = os.path.join(raw_dir, packet_file)
        if not os.path.isfile(path):
            continue
        with open(path, encoding='utf-8') as fh:
            text = fh.read()
        for packet in PACKET_RE.findall(text):
            try:
                videos = json.loads(packet).get('aweme_list') or []
            except ValueError as exc:
                # A truncated or malformed packet must not abort the whole run.
                print('json解析失败-{}'.format(packet_file), exc)
                continue
            for video in videos:
                author = _safe_name(video['author']['nickname'])
                # The URLs in url_list are treated as mirrors of the same
                # video, so only the first one is used.
                url_list = video['video']['play_addr']['url_list']
                if not url_list:
                    continue
                data = fetch(url_list[0])
                if data is None:
                    continue
                author_dir = os.path.join(video_dir, author)
                if not os.path.exists(author_dir):
                    os.makedirs(author_dir)
                    print('{}文件夹创建成功,开始写入..'.format(author))
                # Write unconditionally: the first video of a new author is
                # saved right after its folder has been created.
                numb = per_author.get(author, 0) + 1
                per_author[author] = numb
                stem = author + '_' + str(numb)
                with open(os.path.join(author_dir, stem + '.mp4'), 'wb') as f:
                    f.write(data)
                count += 1
                print('视频下载完成-{}'.format(stem), '...共计第{}个视频'.format(count))
    return count


if __name__ == '__main__':
    download_all()
    print('下载完成!')