twitter
一、用postman 捋顺爬取流程
1、进行身份验证
第一次请求:发送 OPTIONS 预检请求(握手)
第二次请求:用 POST 提交 cookie 和 authorization 信息,获得 guest_token
2、获取视频链接json数据
https://api.twitter.com/1.1/videos/tweet/config/1095304396186046466.json
注意格式是 https://api.twitter.com/1.1/videos/tweet/config/ + 推文id + .json
第一次请求:发送 OPTIONS 预检请求(握手)
第二次请求:提交token信息获取视频链接。如果是mp4可直接下载;如果是m3u8,需要下载并解析它得到ts文件链接,再下载各ts文件并合并成mp4文件
3、下载并解析m3u8文件获取ts文件链接
https://video.twimg.com/ext_tw_video/1103397654887915522/pu/pl/1280x720/GZUYVlhkfV5DnxGW.m3u8
ts链接格式如下:https://video.twimg.com + m3u8文件中列出的ts相对路径
下载后以二进制方式合并即可
二、python 2 代码
# -*- coding: utf-8 -*-
import requests
import json
import urllib2
import os
import sys
# Names of the downloaded .ts segment files, in playlist order.
# gettsurl() appends to it and ts2mp4() reads it when merging.
tsFileNameList = []
# Guest-token step 1: the CORS preflight (OPTIONS) request.
def getToken1():
    """Send an OPTIONS request to guest/activate.json and print the response.

    Nothing is returned; the response object is only printed.
    """
    activate_url = "https://api.twitter.com/1.1/guest/activate.json"
    preflight_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,br',
        'Access-Control-Request-Headers': 'authorization,x-csrf-token',
        'Access-Control-Request-Method': 'POST',
        'Origin': 'https://twitter.com',
    }
    response = requests.options(activate_url, headers=preflight_headers)
    print(response)
# Guest-token step 2: the actual POST that returns the token.
def getToken2():
    """Activate a guest session and return its token.

    POSTs to guest/activate.json with the hard-coded bearer authorization
    and cookie, then reads ``guest_token`` from the JSON reply.

    Returns:
        The ``guest_token`` value encoded as UTF-8.

    Raises:
        KeyError: if the JSON reply has no ``guest_token`` entry.
    """
    # NOTE(review): the bearer token and cookie below are hard-coded
    # credentials; consider loading them from configuration instead.
    header = {
        # A dict literal keeps only the last value of a repeated key, so the
        # single User-Agent that is actually sent is the only one listed.
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,br',
        'Access-Control-Request-Headers': 'authorization,x-csrf-token',
        'Access-Control-Request-Method': 'POST',
        'Origin': 'https://twitter.com',
        'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE',
        'cookie': 'personalization_id=\"v1_pid1UUVchOmH31FJFT2ZLA==\"; guest_id=v1%3A153569557385510737',
        'content-length': '0',
    }
    r = requests.post("https://api.twitter.com/1.1/guest/activate.json", headers=header)
    jsonToken = r.json()
    tokenId = jsonToken["guest_token"].encode("utf-8")
    return tokenId
# Video-config step 1: the CORS preflight (OPTIONS) request.
def getJsondata1(twid):
    """Send an OPTIONS request to the video config URL of a tweet.

    Args:
        twid: tweet id as a string; it is concatenated into the URL.

    Nothing is returned; the response object is only printed.
    """
    config_base = "https://api.twitter.com/1.1/videos/tweet/config/"
    config_url = config_base + twid + ".json"
    preflight_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,br',
        'Access-Control-Request-Headers': 'authorization,x-csrf-token',
        'Access-Control-Request-Method': 'POST',
        'Origin': 'https://twitter.com',
    }
    response = requests.options(config_url, headers=preflight_headers)
    print(response)
# Video-config step 2: the actual GET that returns the playback URL.
def getJsondata2(twid):
    """Fetch the video config JSON of a tweet and return its playback URL.

    A guest token is obtained through getToken2() and sent in the
    ``x-guest-token`` header.

    Args:
        twid: tweet id as a string; it is concatenated into the URL.

    Returns:
        ``track.playbackUrl`` from the JSON reply, encoded as UTF-8, or
        None when the server does not answer with HTTP 200.
    """
    tokenid = getToken2()
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,br',
        'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE',
        'cookie': 'personalization_id=\"v1_pid1UUVchOmH31FJFT2ZLA==\"; guest_id=v1%3A153569557385510737',
        'content-length': '0',
        'x-guest-token': tokenid,
    }
    jsonurl = "https://api.twitter.com/1.1/videos/tweet/config/" + twid + ".json"
    r = requests.get(jsonurl, headers=header)
    # Check the numeric status. A membership test such as `"[200]" in r`
    # iterates over the body chunks of the response, not its status line,
    # so it can never detect a successful reply.
    if r.status_code != 200:
        print("dont have video file")
        return None
    jsondata = r.json()
    print(jsondata)
    m3u8url = jsondata["track"]["playbackUrl"].encode("utf-8")
    print(m3u8url)
    return m3u8url
# Download an mp4 file.
def downloadmp4(mp4url, twid):
    """Download ``mp4url`` into ``./<twid>.mp4`` in 256 KiB chunks.

    Args:
        mp4url: direct URL of the mp4 file.
        twid: tweet id as a string; used as the output file name.
    """
    # NOTE(review): the body is written exactly as received; if the server
    # honoured Accept-Encoding the saved file would be compressed - confirm.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,br',
    }
    req = urllib2.Request(mp4url, None, headers=header)
    response = urllib2.urlopen(req)
    path = "./" + twid + ".mp4"
    print(path)
    try:
        with open(path, 'wb') as output:
            while True:
                chunk = response.read(1024 * 256)
                if not chunk:
                    break
                output.write(chunk)
    finally:
        # Release the HTTP connection even if reading or writing fails.
        response.close()
    print("download ok")
# Download one .ts segment file.
def downloadTsFile(tsurl):
    """Download ``tsurl`` into the current directory in 256 KiB chunks.

    The local file name is the last path component of the URL.

    Args:
        tsurl: absolute URL of the .ts segment.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,br',
    }
    req = urllib2.Request(tsurl, None, headers=header)
    response = urllib2.urlopen(req)
    # Take the last path component instead of a fixed index, so the name
    # does not depend on how many directories the URL contains.
    name = tsurl.rsplit("/", 1)[-1]
    print(name)
    path = "./" + name
    try:
        with open(path, 'wb') as output:
            while True:
                chunk = response.read(1024 * 256)
                if not chunk:
                    break
                output.write(chunk)
    finally:
        # Release the HTTP connection even if reading or writing fails.
        response.close()
# Parse a media playlist and download every .ts segment it lists.
def gettsurl(tslisturl):
    """Download all .ts segments listed in the m3u8 playlist at ``tslisturl``.

    Each segment's file name is appended to the module-level
    ``tsFileNameList`` (in playlist order) before the segment is downloaded
    with downloadTsFile().

    Args:
        tslisturl: absolute URL of the m3u8 playlist that lists the segments.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,br',
    }
    r = requests.get(tslisturl, headers=header)
    for line in r.text.split("\n"):
        # strip() also drops the trailing "\r" of CRLF-terminated playlists,
        # which would otherwise end up in the URL and the file name.
        value = line.replace(" ", "").strip()
        if "#" in value:
            # Playlist tags and comments.
            continue
        if ".ts" in value:
            # Playlist entries are server-relative paths.
            tsurl = "https://video.twimg.com" + value
            # Last path component, independent of the URL depth.
            name = tsurl.rsplit("/", 1)[-1]
            tsFileNameList.append(name.encode("utf-8"))
            print(tsFileNameList)
            downloadTsFile(tsurl)
# Resolve the video URL: download it directly if it is an mp4, otherwise
# walk the m3u8 playlists, download the segments and merge them.
def getm3u8list(twid):
    """Download the video attached to tweet ``twid`` into ``./<twid>.mp4``.

    Args:
        twid: tweet id as a string.

    Nothing is returned. The function prints a message and stops early when
    no playback URL or no variant playlist is available.
    """
    m3u8listuri = getJsondata2(twid)
    if m3u8listuri is None:
        print("dont have m3u8 url")
        return
    # Drop the query string.
    m3u8listurl = m3u8listuri.split("?")[0]
    # NOTE: the extension match is case-sensitive.
    if ".mp4" in m3u8listurl:
        downloadmp4(m3u8listurl, twid)
        return
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,br',
    }
    r = requests.get(m3u8listurl, headers=header)
    for line in r.text.split("\n"):
        # strip() also drops the trailing "\r" of CRLF-terminated playlists.
        value = line.replace(" ", "").strip()
        if "#" in value:
            # Playlist tags and comments.
            continue
        if ".m3u8" in value:
            # Only the first variant playlist listed is used.
            gettsurl("https://video.twimg.com" + value)
            break
    else:
        # No variant playlist found: there is nothing to merge, so do not
        # create an empty mp4 or report success.
        print("dont have m3u8 url")
        return
    ts2mp4(twid)
# Combine the downloaded .ts segments into one mp4 file.
def ts2mp4(twid):
    """Concatenate the files named in ``tsFileNameList`` into ``./<twid>.mp4``.

    Segments are appended in list order as raw bytes, and each segment file
    is deleted once it has been copied.

    Args:
        twid: tweet id as a string; used as the output file name.
    """
    # Local import so the block does not depend on the file's import list.
    import shutil

    path = "./" + twid + ".mp4"
    # `with` closes the output file even if a segment cannot be read.
    with open(path, 'wb') as fmp4:
        for name in tsFileNameList:
            tspath = "./" + name
            with open(tspath, 'rb') as tsread:
                shutil.copyfileobj(tsread, fmp4, 1024 * 256)
            os.remove(tspath)
    print("download ok")
def main():
    """Run the whole download flow for one hard-coded tweet id."""
    # The id is presumably the number after /status/ in a tweet URL such as
    # https://twitter.com/<user>/status/<id> - verify when changing it.
    tweet_id = '1087101124824498180'
    getToken1()
    # Preflight first, then the real config request and download.
    for step in (getJsondata1, getm3u8list):
        step(tweet_id)


# NOTE(review): runs on import as well; consider an
# `if __name__ == "__main__":` guard.
main()