ims-

Python 下载网页视频

因网站结构不同,代码需要相应修改。

下载 mp4 链接

from bs4 import BeautifulSoup
import requests
import urllib
import re
import json
# Text encoding applied to the listing pages fetched by GetwVideoHtml().
encodestyle = 'gbk'
# URL of the first listing page (placeholder; adapt to the target site).
homepage = 'http://www.**.html'
# Prefix prepended to each link collected by GetwVideoHtml().
htmlhead = 'http://www.**'


#GetNPage_html(homepage,n)
#HtmlList2Mp4List(sumhtml)
#Writelist2json(listname,lists)

def GetwVideoHtml(furl):
    """Collect the video-page links found on one listing page.

    Fetches ``furl``, decodes the response with ``encodestyle`` and gathers
    the ``href`` of every ``<a>`` nested inside an element with the CSS
    class ``video_box``.

    Args:
        furl: URL of the listing page to scan.

    Returns:
        A list of strings, each being ``htmlhead`` + the anchor's ``href``.
    """
    res = requests.get(furl)
    res.encoding = encodestyle
    soup = BeautifulSoup(res.text, 'html.parser')
    retlist = []
    for video_box in soup.select('.video_box'):  # <div class="video_box">
        # Select only anchors that carry an href; indexing tag['href'] on an
        # anchor without one would raise KeyError.
        for tag_a in video_box.select('a[href]'):
            retlist.append(htmlhead + tag_a['href'])
    return retlist
def GetNPage_html(homepage, n):
    """Collect the video-page links of the first ``n`` listing pages.

    Page 1 is ``homepage`` itself.  Page ``k`` (k >= 2) is built by cutting
    ``homepage`` at its last ``.`` and appending ``_<k>.html``.  Each page
    URL is printed before it is scanned with ``GetwVideoHtml``.

    Args:
        homepage: URL of the first listing page.
        n: Number of listing pages to scan; values below 1 scan nothing.

    Returns:
        One flat list with the links of all scanned pages, page by page.
    """
    # The part before the last '.' is the same for every page, so split once.
    base = homepage.rsplit('.', 1)[0]
    rethtml = []
    for num in range(1, n + 1):
        if num == 1:
            homewebpage = homepage
        else:
            homewebpage = base + '_' + str(num) + '.html'
        print(homewebpage)
        rethtml.extend(GetwVideoHtml(homewebpage))
    return rethtml

def GetMp4SrcFromHtml(url):
    """Return the distinct mp4 links that appear in the page at ``url``.

    The page is fetched with a browser-like ``User-Agent`` header, decoded
    as GBK and scanned for ``http://`` / ``https://`` URLs ending in
    ``.mp4`` (case-insensitive).

    Args:
        url: Address of the page to scan.

    Returns:
        A list of the mp4 URLs found, without duplicates, in order of first
        appearance.
    """
    # ``import urllib`` alone does not load the ``request`` submodule.
    import urllib.request

    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Installed as the process-wide default opener, so every later
    # urllib.request.urlopen call goes through it as well.
    urllib.request.install_opener(opener)
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # Only ASCII URLs are extracted, so a stray undecodable byte must not
    # abort the whole page.
    text = raw.decode('gbk', errors='replace')
    # A URL stops at whitespace, quotes or angle brackets; the non-greedy
    # quantifier and the escaped dot keep neighbouring links on the same
    # line from being merged into one match.
    pattern = re.compile(r'''https?://[^\s"'<>]+?\.mp4''', re.I)
    return list(dict.fromkeys(pattern.findall(text)))
def HtmlList2Mp4List(sumhtml):
    """Flatten the mp4 links of several pages into one list.

    Args:
        sumhtml: Iterable of page URLs; each is passed to
            ``GetMp4SrcFromHtml``.

    Returns:
        A single list with the links of every page, in page order.
    """
    return [
        link
        for page_url in sumhtml
        for link in GetMp4SrcFromHtml(page_url)
    ]
def Writelist2json(listname, lists, folder='D:/ipynb/commfile/'):
    """Dump ``lists`` as JSON to ``<folder>/<listname>_len_<count>.json``.

    Args:
        listname: Prefix of the output file name.
        lists: JSON-serialisable sequence; its length becomes part of the
            file name.
        folder: Directory that receives the file.  It must already exist.
            Defaults to the location the script originally hard-coded.
    """
    import os

    filename = listname + '_len_' + str(len(lists)) + '.json'
    with open(os.path.join(folder, filename), 'w') as fw:
        json.dump(lists, fw)
        
# Scan the first 3 listing pages, resolve every collected page to its mp4
# links, then save the links as JSON.
sumhtml = GetNPage_html(homepage, n=3)
mp4list = HtmlList2Mp4List(sumhtml=sumhtml)
Writelist2json(listname="mp4list", lists=mp4list)

下载部分

from bs4 import BeautifulSoup
import requests
import urllib
import json
import threading
import datetime
import os

def mkdir(path):
    """Create ``path`` as a directory unless something already exists there.

    A status message is printed in both cases.

    Args:
        path: Directory to create.
    """
    if os.path.exists(path):
        print ("---  There is this folder!  ---")
        return
    # makedirs also creates any missing intermediate directories.
    os.makedirs(path)
    print ("---  new folder...  ---")
    print ("---  OK  ---")

def Schedule(a, b, c):
    """Progress callback: print how much of a download has completed.

    The three parameters follow the ``reporthook`` convention of
    ``urllib.request.urlretrieve``.

    Args:
        a: Number of data blocks downloaded so far.
        b: Size of one data block.
        c: Size of the remote file; zero or negative when it is empty or
            unknown.
    """
    if c <= 0:
        # No usable total size: a percentage would divide by zero or come
        # out negative, so report the amount transferred instead.
        print('%d bytes' % (a * b))
        return
    # The last block usually overshoots the file size; cap at 100 %.
    per = min(100.0 * a * b / c, 100)
    print('%.2f%%' % per)
def createdownloadlink(name, url):
    """Download ``url`` and store it in the local file ``name``.

    Args:
        name: Path of the file to write.
        url: Address of the resource to fetch.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another module having
    # loaded it already.
    import urllib.request

    urllib.request.urlretrieve(url, name)

class myThread(threading.Thread):
    """Thread that downloads one URL into one local file."""

    def __init__(self, name, url):
        """Remember the download target and source.

        Args:
            name: Path of the file to write.
            url: Address of the resource to fetch.
        """
        super().__init__()
        # ``name`` is also the thread-name attribute of threading.Thread, so
        # the target path doubles as the thread's name.
        self.name = name
        self.url = url

    def run(self):
        """Download ``self.url`` to ``self.name``, printing start and end."""
        # ``import urllib`` alone does not load the ``request`` submodule.
        import urllib.request

        print ("开始下载:" + self.name)
        urllib.request.urlretrieve(self.url, self.name)
        print ("完成下载:" + self.name)
def DownMp4file(lists, basefolder='D:/videos/'):
    """Prepare one download thread per URL, targeting a dated folder.

    The folder ``<basefolder><MM-DD>`` (current month and day) is created
    through ``mkdir``.  The URL at position ``i`` is assigned the file
    ``<folder>/<i>.mp4``.  The threads are created but not started.

    Args:
        lists: Iterable of URLs to download.
        basefolder: Prefix of the dated folder.  Defaults to the location
            the script originally hard-coded.

    Returns:
        A list of unstarted ``myThread`` objects, one per URL, in input
        order.
    """
    dateASfolder = datetime.datetime.now().strftime('%m-%d')
    foldername = basefolder + dateASfolder
    mkdir(foldername)
    return [
        myThread(foldername + '/' + str(filenum) + '.mp4', url)
        for filenum, url in enumerate(lists)
    ]
# Execution part

# NOTE(review): the scraper script above saves "mp4list_len_<N>.json"; this
# path names a different file - confirm it is the list you want to download.
with open('D:/ipynb/commfile/srcmp4s_len_66.json', 'r') as fr:
    srcmp4s = json.load(fr)

print(len(srcmp4s))

threads = DownMp4file(srcmp4s)

# NOTE(review): only the first 10 prepared threads are started and awaited;
# widen the slice if every link should be downloaded.
for t in threads[:10]:
    t.start()
for t in threads[:10]:
    t.join()  # block until this download has finished

分类:

技术点:

相关文章: