python爬虫脚本下载YouTube视频

爬虫

python

YouTube视频

工作环境：

python 2.7.13
pip
lxml：使用 pip install lxml 安装，主要用 xpath 查找节点，也可以用 re 模块代替
pytube, 安装 pip install pytube
科学上网工具

参考:

源码：

# coding: utf-8 
__author__ = "zwzhou" 
__date__ = "2017-03-19" 
 

import urllib2 

from pytube import YouTube 

from pprint import pprint 

from lxml import etree 

import sys,getopt 
 

def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The response body exactly as returned by ``response.read()``.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or ``read()`` propagates
        to the caller unchanged.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1284.0 Safari/537.13'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so close
        # explicitly to release the connection even if read() fails.
        response.close()
     

def getUrl(html):
    """Download the videos linked from a page, then crawl on to linked pages.

    Every ``thumb-wrapper`` link found in ``html`` is paired with its duration
    text. A video is downloaded to ``savepath`` when its file name is not yet
    in ``videoLists`` and ``checktime`` accepts its duration. After a page has
    been processed, the first link on it that has not been followed yet is
    fetched with ``getHtml`` and processed the same way.

    Module-level state used: ``savepath``, ``maxNumber`` and ``videoLists``
    are read (``videoLists`` is appended to), ``cur_count`` is incremented.

    Args:
        html: HTML source of the page to start from.

    Raises:
        SystemExit: via ``sys.exit()`` once ``cur_count`` reaches
            ``maxNumber``.
    """
    global cur_count  # the only module-level name that is rebound here

    baseurl = r'https://www.youtube.com'
    visited = set()  # hrefs already followed, so cycles of links terminate

    while html:
        tree = etree.HTML(html)
        if tree is None:
            # Nothing parseable on this page, so there is nothing to crawl.
            break
        urllist = tree.xpath(u'//div[@class="thumb-wrapper"]/a/@href')
        urllist_time = tree.xpath(u'//div[@class="thumb-wrapper"]/a/span/span/text()')

        for item_name, item_length in zip(urllist, urllist_time):
            try:
                yt = YouTube(baseurl + item_name)
            except Exception:
                print("Some thing wrong about the authority")
                # Without this, the code below would run with ``yt`` either
                # undefined or left over from the previous video.
                continue

            print("video name:" + yt.filename)
            print("video time:" + item_length)
            if yt.filename in videoLists:  # already downloaded
                print("This video has been downloaded!")
                continue
            if not checktime(item_length):
                print('This video is too long and it will not be downloaded, just be ignored!')
                continue

            video = yt.filter('mp4')[-1]
            print("Now is loading %s------------>" % yt.filename)
            video.download(savepath)
            print("--------------->%sVideo is loaded!" % yt.filename)
            cur_count += 1
            videoLists.append(yt.filename)
            if cur_count >= maxNumber:  # requested number of videos reached
                print('There are %d videos downloaded!This task is completed!' % maxNumber)
                # TODO: if necessary, the videoLists can be logged
                sys.exit()

        # Move on to the next page. The link has to be fetched first: this
        # function parses HTML, not URLs.
        next_href = next((href for href in urllist if href not in visited), None)
        if next_href is None:
            break
        visited.add(next_href)
        html = getHtml(baseurl + next_href)
 
 

def checktime(timelength, threshold=None):
    """Return True when a video duration is strictly below the time limit.

    Args:
        timelength: Duration text made of colon-separated integer fields,
            most significant first, e.g. ``"45"``, ``"3:45"`` or
            ``"1:02:03"``. Each field is worth 60 times the one after it.
        threshold: Limit in seconds. When omitted, the module-level
            ``timeThreshold`` is used.

    Returns:
        ``True`` if the duration in seconds is less than the limit,
        otherwise ``False``.

    Raises:
        ValueError: if a field of ``timelength`` is not an integer.
    """
    if threshold is None:
        threshold = timeThreshold

    # Fold the fields left to right so that "H:MM:SS" is not mistaken for
    # "MM:SS" (which would let hour-long videos pass the filter).
    seconds = 0
    for field in timelength.strip().split(':'):
        seconds = seconds * 60 + int(field)
    return seconds < threshold
 

def usage():
    """Print the command-line help text to standard output."""
    # Single-argument print(...) parses on both Python 2 and Python 3.
    print('''

    usage: python dl_youtube [option] [arg]

    options and args:

    -s      : download path

    -t      : time threshold of the video to be loaded, in seconds

    -u      : start url which to be crawled, it can be set more than one time

    -n      : when downloading is stop, i.e. how many videos will be downloaded, default is 10000.

    -h      : print this help message

    ''')
 

if __name__ == "__main__":
    # Crawled only when no -u option is given on the command line.
    default_start_url = 'https://www.youtube.com/watch?v=TThzH_sJo6o'
    start_urls = []
    videoLists = []  # file names already downloaded, to avoid duplicates
    # Defaults, overridable from the command line
    savepath = r"D://MyDownloads"
    maxNumber = 10000
    timeThreshold = 240
    cur_count = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:n:u:')
    except getopt.GetoptError as err:
        # Unknown option or missing argument: explain instead of a traceback.
        print(str(err))
        usage()
        sys.exit(2)

    for op, value in opts:
        if op == "-s":  # download directory, default D://MyDownloads
            savepath = value
        elif op == '-t':  # duration limit in seconds, default 240
            timeThreshold = int(value)
        elif op == "-h":  # help
            usage()
            sys.exit()
        elif op == '-n':  # number of videos to download before stopping
            maxNumber = int(value)
        elif op == '-u':  # start URL for the crawl; may be repeated
            if value not in start_urls:
                start_urls.append(value)

    if not start_urls:
        start_urls.append(default_start_url)

    for item in start_urls:
        html = getHtml(item)
        getUrl(html)
         

使用

python dl_youtube.py -n 10 -s D://MyDownloads -t 600 -u https://www.youtube.com/watch?v=TThzH_sJo6o 

将从页面 https://www.youtube.com/watch?v=TThzH_sJo6o 开始搜索，下载10段时长小于10分钟（600秒）的视频，并保存到 D://MyDownloads 文件夹中。