[小爬虫]——某网站视频爬虫
-
技术路线:requests + re
-
关于exe下载:可能涉及到侵权
-
源代码:下面
-
爬取思路:在html中找出加载资源的js文件,从中截取一段以.m3u8结尾、经过百分号编码(%XX,十六进制)的字符串,解码后得到第一层有效的m3u8链接;爬取此m3u8文件并在其中找到第二层.m3u8链接,拼接成完整地址后再次下载,该文件中列出了此视频的全部.ts分片,将这些.ts文件爬下来并按顺序合并即可
-
效果:10s内完成400M的下载
-
总结:去看了许多人的blog,许多网站存储播放视频都是采取一样的策略,即[ 两层.m3u8 + 无加密.ts文件 ]
-
代码:
import requests, re, os, threadpool
# Directory that receives the downloaded .ts segments and the merged video.
# It is the "video" folder next to this script (matching the message printed
# by main()), and keeps a trailing separator because the functions below
# build file paths by plain string concatenation.
root = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'video') + os.sep
# Headers sent with every HTTP request.
kv = {'User-Agent': 'Mozilla/5.0'}
# mp4sNum: total number of segments found; tim: segments processed so far.
mp4sNum, tim = 0, 0
# Work lists (one list of segment paths per entry) filled by makeList().
ls = []
# Base URL prepended to every segment path; assigned by play().
wangzhi = ''
def getHtml(url):
    """Fetch *url* and return the response body as text.

    The request is sent with the module-level ``kv`` headers and a 30 second
    timeout. The body is decoded with the encoding detected from its content
    (``apparent_encoding``).

    Returns:
        The page text, or ``None`` when the request fails or the server
        answers with an error status. In that case ``'Html Error.'`` is
        printed.
    """
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP problems are handled here; anything else
        # (including Ctrl+C) is allowed to propagate.
        print('Html Error.')
        return None
    r.encoding = r.apparent_encoding
    return r.text
def tran2(t):
    """Return the numeric value of the hexadecimal digit *t*.

    Accepts ``0``-``9`` and ``A``-``F`` in either case.

    Raises:
        ValueError: if *t* is not a hexadecimal digit or a decimal number.
    """
    if len(t) == 1 and t in '0123456789abcdefABCDEF':
        return int(t, 16)
    # Fallback kept for backward compatibility: longer strings are still
    # parsed as plain decimal numbers.
    return int(t)
def tran1(s):
    """Decode the ``%XX`` escapes in *s* and return the resulting string.

    Every ``%`` followed by two hexadecimal digits (either case) is replaced
    by the single character with that code point; bytes are not combined
    into multi-byte UTF-8 characters. Anything that is not a complete,
    valid escape (a trailing ``%``, ``%u1234`` and so on) is left untouched.
    """
    return re.sub(
        r'%([0-9A-Fa-f]{2})',
        lambda match: chr(int(match.group(1), 16)),
        s,
    )
def makeList(n, mp4s):
    """Split *mp4s* into at most *n* consecutive groups stored in ``ls``.

    The module-level list ``ls`` is refilled in place, so a second call
    replaces the previous groups rather than adding to them. Only non-empty
    groups are stored, and the original order of *mp4s* is kept.

    Args:
        n: Maximum number of groups (one per worker thread).
        mp4s: Sequence of segment paths to distribute.

    Raises:
        ValueError: if *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')
    items = list(mp4s)
    # Ceiling division spreads the items over as many of the n groups as
    # possible; max() keeps the range step valid when there are no items.
    size = max(1, -(-len(items) // n))
    # Slice assignment mutates the shared list instead of rebinding it.
    ls[:] = [items[start:start + size] for start in range(0, len(items), size)]
def fastNB(tvs, max_retries=5, timeout=30):
    """Download every segment listed in *tvs* into the ``root`` directory.

    Each entry is a URL path that is appended to the module-level base URL
    ``wangzhi``. The file is named after the last path component and gets a
    ``.ts`` extension. Segments whose file already exists are skipped.

    Args:
        tvs: List of segment paths; an empty list is a no-op.
        max_retries: Attempts per segment before giving up.
        timeout: Per-request timeout in seconds.
    """
    if not tvs:
        return
    global tim, mp4sNum
    for i in tvs:
        # tim is shared by every worker thread and updated without a lock,
        # so the printed percentage is only approximate.
        tim += 1
        name = i.split('/')[-1].split('.')[0]
        path = root + str(name) + '.ts'
        if os.path.exists(path):
            continue
        # Write to a temporary name first so that an interrupted download
        # is never mistaken for a finished segment on the next run.
        tmp_path = path + '.part'
        for _ in range(max_retries):
            try:
                r = requests.get(wangzhi + i, headers=kv, timeout=timeout)
                # Refuse to store an HTTP error page as video data.
                r.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    f.write(r.content)
                os.replace(tmp_path, path)
            except (requests.RequestException, OSError):
                done = False
            else:
                done = True
            if mp4sNum:
                print('\r进度为:{:.2%}'.format(tim / mp4sNum), end='')
            if done:
                break
        else:
            # All attempts failed: report it instead of retrying forever.
            print('\n下载失败:{}'.format(wangzhi + i))
def download(text):
    """Download all segments listed in the playlist *text* and merge them.

    Steps:
      1. Extract the ``.ts`` paths from *text* and record their count in the
         module-level ``mp4sNum``.
      2. Ask for a thread count, split the paths with makeList() and fetch
         them concurrently with fastNB() through a thread pool.
      3. Rename the segments to zero-padded sequence numbers in playlist
         order.
      4. Concatenate them into ``video.ts`` inside ``root`` and delete the
         individual segment files.

    Nothing is downloaded (and no prompt is shown) when *text* is empty or
    contains no segment paths.
    """
    import shutil

    global mp4sNum
    mp4s = re.findall(pattern=r'/\d*?/\w*?/.*?/.*?/.*?\.ts', string=text or '')
    mp4sNum = len(mp4s)
    print('共有', mp4sNum, '个ts文件')
    if not mp4s:
        return
    # At least one worker is always used; zero workers could never finish.
    n = max(1, int(input('输入创建的线程数量:')))
    os.makedirs(root, exist_ok=True)
    makeList(n, mp4s)
    pool = threadpool.ThreadPool(n)
    for req in threadpool.makeRequests(fastNB, ls):
        pool.putRequest(req)
    pool.wait()

    print('\n正在重新排序...')
    ordered = []
    for cnt, i in enumerate(mp4s, 1):
        name = i.split('/')[-1].split('.')[0]
        # Five-digit, zero-padded sequence number, e.g. 00001.ts
        pathNew = root + str(cnt).zfill(5) + '.ts'
        os.rename(root + name + '.ts', pathNew)
        ordered.append(pathNew)

    print('正在拼装视频...')
    with open(root + 'video.ts', 'wb') as f:
        for path in ordered:
            with open(path, 'rb') as seg:
                shutil.copyfileobj(seg, f)
            # The segment is closed before removal, which Windows requires.
            os.remove(path)
def play(text):
    """Resolve the real playlist for the video page *text* and download it.

    The page references a play-data script that holds a percent-encoded
    ``.m3u8`` link. That link is decoded, the first playlist is fetched, the
    second-level playlist path inside it is joined to the player host, and
    the resulting playlist is passed to download(). The module-level base
    URL ``wangzhi`` is set to the player host for the segment downloads.

    Raises:
        ValueError: if any step cannot find or load what it needs.
    """
    global wangzhi

    js = re.findall(pattern=r'/upload/playdata/.*?\.js', string=text or '')
    if not js:
        raise ValueError('no /upload/playdata/*.js reference found in the page')
    text = getHtml('http://www.xxx.com' + js[0])
    if not text:
        raise ValueError('could not load the play-data script')

    # Cut out the span between two escaped "在线播放" labels (if present) so
    # that the link is taken from the remaining text.
    tmp = re.findall(pattern=r'\%u5728\%u7ebf\%u64ad\%u653e.*\%24\%u5728\%u7ebf\%u64ad\%u653e', string=text)
    if tmp:
        text = text.replace(tmp[0], '')
    tmp = re.findall(pattern=r'http.*?\.m3u8', string=text)
    if not tmp:
        raise ValueError('no .m3u8 link found in the play-data script')
    # Strip only the leading scheme (http or https) before decoding; the
    # link is then requested over plain http.
    urlFake = 'http' + tran1(re.sub(r'^https?', '', tmp[0]))

    playerName = ['这里存各种网站使用的主用/备用播放器']
    name = next((player for player in playerName if player in urlFake), '')
    if not name:
        raise ValueError('unknown player host in ' + urlFake)

    text = getHtml(urlFake)
    if not text:
        raise ValueError('could not load the first-level playlist')
    tmp = re.findall(pattern=r'/.*?m3u8', string=text)
    if not tmp:
        raise ValueError('no second-level .m3u8 path in the first playlist')

    wangzhi = 'http://' + name + '.com/'
    playlist = getHtml(wangzhi + tmp[0])
    if not playlist:
        raise ValueError('could not load the second-level playlist')
    download(playlist)
def main():
    """Ask for a page path on the site, fetch that page and process it."""
    print('下载位置在同目录的video下')
    # Surrounding blanks and a leading slash are dropped so the path joins
    # cleanly with the site root below.
    url = input('Please input a link:').strip().lstrip('/')
    page = getHtml('http://www.xxx.com/' + url)
    if page is None:
        # getHtml() has already printed the error; there is nothing to parse.
        return
    play(page)


if __name__ == '__main__':
    main()