Twitter

一、用postman 捋顺爬取流程

1、进行身份验证 

第一次握手连接

python 爬取m3u8 to MP4 视频

第二次提交cookie信息,获得token信息

python 爬取m3u8 to MP4 视频

2、获取视频链接json数据

https://api.twitter.com/1.1/videos/tweet/config/1095304396186046466.json 

注意是 https://api.twitter.com/1.1/videos/tweet/config/+推文id.json

第一次握手

python 爬取m3u8 to MP4 视频

提交token信息获取视频链接:如果是mp4可直接下载;如果是m3u8,则需要先下载并解析该文件获得ts文件链接,再下载各ts文件并合并成mp4文件

python 爬取m3u8 to MP4 视频

3、下载并解析m3u8文件获取ts文件链接

https://video.twimg.com/ext_tw_video/1103397654887915522/pu/pl/1280x720/GZUYVlhkfV5DnxGW.m3u8

ts链接格式如下:

https://video.twimg.com/ext_tw_video/1109741602329657344/pu/vid/15000/18000/720x1280/aTUklwiSueHb35rU.ts

下载后以二进制方式合并即可

二、python 2 代码

# -*- coding: utf-8 -*-
import requests
import json
import urllib2
import os
import sys
# Module-level list of .ts segment file names; it starts out empty and is
# shared by the functions in this script.
tsFileNameList = []
def getToken1():
    """Send an OPTIONS request to the guest-activation endpoint.

    The headers (Origin plus Access-Control-Request-*) mimic the CORS
    preflight a browser would send before the real POST. The response
    object is only printed; nothing is returned.
    """
    activate_url = "https://api.twitter.com/1.1/guest/activate.json"
    preflight_headers = {
        'Origin': 'https://twitter.com',
        'Access-Control-Request-Method': 'POST',
        'Access-Control-Request-Headers': 'authorization,x-csrf-token',
        'Accept-Encoding': 'gzip,deflate,br',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    }
    response = requests.options(activate_url, headers=preflight_headers)
    print(response)

def getToken2():
    """POST to the guest-activation endpoint and return the guest token.

    Returns:
        The ``guest_token`` field of the JSON response, UTF-8 encoded.

    Raises:
        ValueError: if the response body cannot be parsed as JSON.
        KeyError: if the JSON has no ``guest_token`` field.
    """
    header = {
        # The original dict listed 'User-Agent' twice; the later value
        # silently won, so only that effective value is kept here.
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
        'Accept': '*/*',
        # 'br' is not advertised: requests can only decode Brotli when an
        # optional brotli package is installed, and the body is parsed below.
        'Accept-Encoding': 'gzip,deflate',
        'Access-Control-Request-Headers': 'authorization,x-csrf-token',
        'Access-Control-Request-Method': 'POST',
        'Origin': 'https://twitter.com',
        # NOTE(review): hard-coded bearer token and cookie -- confirm they
        # are still valid and are meant to live in source.
        'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE',
        'cookie': 'personalization_id=\"v1_pid1UUVchOmH31FJFT2ZLA==\"; guest_id=v1%3A153569557385510737',
        'content-length': '0',
    }
    r = requests.post("https://api.twitter.com/1.1/guest/activate.json", headers=header)
    jsonToken = r.json()
    tokenId = jsonToken["guest_token"].encode("utf-8")
    return tokenId

def getJsondata1(twid):
    """Send an OPTIONS request to the video-config endpoint of a tweet.

    The headers (Origin plus Access-Control-Request-*) mimic the CORS
    preflight a browser would send before the real request. The response
    object is only printed; nothing is returned.

    Args:
        twid: tweet id as a string; it is spliced into the endpoint URL.
    """
    config_url = "".join(
        ["https://api.twitter.com/1.1/videos/tweet/config/", twid, ".json"]
    )
    preflight_headers = {
        'Origin': 'https://twitter.com',
        'Access-Control-Request-Method': 'POST',
        'Access-Control-Request-Headers': 'authorization,x-csrf-token',
        'Accept-Encoding': 'gzip,deflate,br',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    }
    response = requests.options(config_url, headers=preflight_headers)
    print(response)

def getJsondata2(twid):
    """Fetch the video config JSON of a tweet and return its playback URL.

    A guest token is obtained via ``getToken2()`` and sent in the
    ``x-guest-token`` header.

    Args:
        twid: tweet id as a string; it is spliced into the endpoint URL.

    Returns:
        ``track.playbackUrl`` from the JSON response, UTF-8 encoded, or
        ``None`` when the endpoint does not answer with HTTP 200.

    Raises:
        ValueError: if the response body cannot be parsed as JSON.
        KeyError: if the JSON has no ``track``/``playbackUrl`` entry.
    """
    tokenid = getToken2()
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        # 'br' is not advertised: requests can only decode Brotli when an
        # optional brotli package is installed, and the body is parsed below.
        'Accept-Encoding': 'gzip,deflate',
        # NOTE(review): hard-coded bearer token and cookie -- confirm they
        # are still valid and are meant to live in source.
        'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE',
        'cookie': 'personalization_id=\"v1_pid1UUVchOmH31FJFT2ZLA==\"; guest_id=v1%3A153569557385510737',
        'content-length': '0',
        'x-guest-token': tokenid,
    }
    jsonurl = "https://api.twitter.com/1.1/videos/tweet/config/" + twid + ".json"
    r = requests.get(jsonurl, headers=header)
    # The original tested `"[200]" not in r`, which iterates the response
    # body chunks instead of looking at the status, so the check always
    # fired. Compare the numeric status code instead.
    if r.status_code != 200:
        print("dont have video file")
        return None
    jsondata = r.json()
    print(jsondata)
    m3u8url = jsondata["track"]["playbackUrl"].encode("utf-8")
    print(m3u8url)
    return m3u8url

def downloadmp4(mp4url, twid):
    """Download an MP4 file and save it as ``./<twid>.mp4``.

    Args:
        mp4url: direct URL of the MP4 file.
        twid: tweet id as a string; used as the output file name.

    Raises:
        urllib2.URLError: if the URL cannot be opened.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        # No Accept-Encoding header: urllib2 does not decompress response
        # bodies, so asking for gzip/deflate/br could store a compressed
        # stream under the .mp4 name.
    }
    req = urllib2.Request(mp4url, None, headers=header)
    response = urllib2.urlopen(req)
    try:
        path = "./" + twid + ".mp4"
        print(path)
        with open(path, 'wb') as output:
            # Copy in 256 KiB chunks until read() returns an empty string.
            for chunk in iter(lambda: response.read(1024 * 256), b""):
                output.write(chunk)
    finally:
        # Release the connection even if writing fails.
        response.close()
    print("download ok")
    
def downloadTsFile(tsurl):
    """Download one .ts segment into the current directory.

    The file is saved under the last path segment of ``tsurl``.

    Args:
        tsurl: absolute URL of the .ts segment.

    Raises:
        urllib2.URLError: if the URL cannot be opened.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        # No Accept-Encoding header: urllib2 does not decompress response
        # bodies, so asking for gzip/deflate/br could store a compressed
        # stream as the segment.
    }
    # Take the last path segment rather than a fixed index ([10]) so the
    # name does not depend on how deep the URL path happens to be.
    name = tsurl.rsplit("/", 1)[-1]
    print(name)
    path = "./" + name

    req = urllib2.Request(tsurl, None, headers=header)
    response = urllib2.urlopen(req)
    try:
        with open(path, 'wb') as output:
            # Copy in 256 KiB chunks until read() returns an empty string.
            for chunk in iter(lambda: response.read(1024 * 256), b""):
                output.write(chunk)
    finally:
        # Release the connection even if writing fails.
        response.close()

def gettsurl(tslisturl):
    """Download every .ts segment listed in a media playlist.

    Each non-comment playlist line containing ``.ts`` is treated as a path
    on ``https://video.twimg.com``. Its file name (last path segment) is
    appended to the module-level ``tsFileNameList`` and the segment is
    fetched with ``downloadTsFile``.

    Args:
        tslisturl: URL of the .m3u8 playlist that lists the segments.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        # 'br' is not advertised: requests can only decode Brotli when an
        # optional brotli package is installed, and the text is parsed below.
        'Accept-Encoding': 'gzip,deflate',
    }
    r = requests.get(tslisturl, headers=header)
    # splitlines() + strip() also copes with CRLF playlists, where a plain
    # split("\n") would leave a trailing "\r" on every URL.
    for line in r.text.splitlines():
        value = line.replace(" ", "").strip()
        # Playlist tags/comments start with '#'; skip them and blank lines.
        if not value or value.startswith("#"):
            continue
        if ".ts" in value:
            tsurl = "https://video.twimg.com" + value
            # Last path segment instead of a fixed index ([10]).
            name = tsurl.rsplit("/", 1)[-1]
            tsFileNameList.append(name.encode("utf-8"))
            print(tsFileNameList)
            downloadTsFile(tsurl)
        

def getm3u8list(twid):
    """Download the video of a tweet as ``./<twid>.mp4``.

    The playback URL comes from ``getJsondata2``. If it points at an MP4 it
    is downloaded directly; otherwise it is treated as a master .m3u8
    playlist: the first variant playlist listed is passed to ``gettsurl``
    and the downloaded segments are merged with ``ts2mp4``.

    Args:
        twid: tweet id as a string.
    """
    m3u8listuri = getJsondata2(twid)
    if m3u8listuri is None:
        print("dont have m3u8 url")
        return
    # Drop the query string before inspecting the extension.
    m3u8listurl = m3u8listuri.split("?")[0]

    # Case-insensitive so ".MP4" is recognised too.
    if ".mp4" in m3u8listurl.lower():
        downloadmp4(m3u8listurl, twid)
        return

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': '*/*',
        # 'br' is not advertised: requests can only decode Brotli when an
        # optional brotli package is installed, and the text is parsed below.
        'Accept-Encoding': 'gzip,deflate',
    }
    r = requests.get(m3u8listurl, headers=header)

    # splitlines() + strip() also copes with CRLF playlists.
    for line in r.text.splitlines():
        value = line.replace(" ", "").strip()
        # Playlist tags/comments start with '#'; skip them and blank lines.
        if not value or value.startswith("#"):
            continue
        if ".m3u8" in value:
            # Only the first variant listed is used; no resolution is chosen.
            gettsurl("https://video.twimg.com" + value)
            break
    else:
        # No variant playlist found: do not create an empty mp4 file.
        print("dont have m3u8 url")
        return
    ts2mp4(twid)

def ts2mp4(twid):
    """Concatenate the downloaded .ts segments into ``./<twid>.mp4``.

    Segments are read from the current directory in the order recorded in
    the module-level ``tsFileNameList``, appended byte-for-byte to the
    output file, and deleted afterwards. The list is emptied once the merge
    is done so a later call does not look for files already removed.

    Args:
        twid: tweet id as a string; used as the output file name.

    Raises:
        IOError: if a listed segment file cannot be opened.
    """
    path = "./" + twid + ".mp4"
    # 'with' closes the output file even when a segment is missing.
    with open(path, 'wb') as fmp4:
        for val in tsFileNameList:
            tspath = "./" + val
            with open(tspath, 'rb') as tsread:
                # Copy in 256 KiB chunks until read() returns nothing.
                for chunk in iter(lambda: tsread.read(1024 * 256), b""):
                    fmp4.write(chunk)
            # Remove only after the segment is closed; deleting an open
            # file fails on Windows.
            os.remove(tspath)
    # Clear in place so the shared module-level list object is reused.
    del tsFileNameList[:]
    print("download ok")

def main():
    """Run the download flow for one hard-coded tweet id.

    Sends the two OPTIONS requests first, then downloads the video.
    """
    # twid is the numeric id at the end of a tweet URL of the form
    # https://twitter.com/<user>/status/<twid>
    twid = '1087101124824498180'
    getToken1()
    getJsondata1(twid)
    getm3u8list(twid)


# Only start downloading when executed as a script, not on import.
if __name__ == "__main__":
    main()

 

相关文章: