爬虫+多线程
有时候需要爬取的数据量比较大,我们可以使用多线程进行操作,下面直接上示例
利用线程池爬取梨视频的短视频
pool.map() 接收两个参数:第一个参数是函数名(不带括号 ()),第二个参数是一个可迭代对象。
这里我们请求和存储都使用线程池,所以要将请求和保存都单独封装成函数,方便给pool.map()传参
import os
import re
import uuid
# multiprocessing.dummy exposes the Pool API backed by threads instead of
# processes, which suits this I/O-bound download workload.
from multiprocessing.dummy import Pool

import requests
from lxml import etree

# Listing page to crawl (the "社会" / Society category, per the original note).
url = "https://www.pearvideo.com/category_1"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
}
# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block a worker thread (or the whole script) indefinitely.
REQUEST_TIMEOUT = 10


def get_video(url):
    """Download ``url`` and return the raw response body as bytes.

    Takes a single argument so it can be passed directly to ``pool.map()``.
    Raises whatever ``requests.get`` raises (connection errors, timeouts).
    """
    return requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).content


def save_video(video):
    """Write the bytes in ``video`` to ``./videos/<random uuid4>.mp4``.

    A fresh UUID is used as the file name so concurrent workers never
    collide. The ``./videos`` directory must already exist.
    """
    filename = "./videos/" + str(uuid.uuid4()) + ".mp4"
    with open(filename, "wb") as f:
        f.write(video)


# Create the thread pool with at most 5 worker threads.
pool = Pool(5)

try:
    page_text = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text
    tree = etree.HTML(page_text)
    # Each video on the listing page lives in its own <li>.
    li_list = tree.xpath("//div[@id='listvideoList']/ul/li")

    # Build the iterable handed to pool.map(): one direct video URL per entry.
    video_url_list = []
    for li in li_list:
        # Relative link to the video's detail page; skip items without one
        # instead of crashing on an empty XPath result.
        hrefs = li.xpath("./div/a/@href")
        if not hrefs:
            continue
        detail_url = "https://www.pearvideo.com/" + hrefs[0]
        detail_page_text = requests.get(
            url=detail_url, headers=headers, timeout=REQUEST_TIMEOUT
        ).text
        # The real media URL is embedded in inline JavaScript on the detail
        # page, e.g.:
        # sdUrl="",ldUrl="",srcUrl="https://video.pearvideo.com/mp4/short/20190228/cont-1523251-13634163-hd.mp4",vdoUrl=srcUrl,skinRes="//www.pearvideo.com/domain/skin"
        matches = re.findall(r'srcUrl="(.*?)",vdoUrl=srcUrl', detail_page_text, re.S)
        if not matches:
            # Page layout differs or the video is unavailable; skip it.
            continue
        video_url_list.append(matches[0])
    print(video_url_list)

    # save_video() opens files under ./videos, so make sure it exists first.
    os.makedirs("./videos", exist_ok=True)

    # Download every video concurrently; results keep the input order.
    video_list = pool.map(get_video, video_url_list)
    # Write every downloaded video to disk concurrently.
    pool.map(save_video, video_list)
finally:
    # Release the worker threads even if a request or write failed.
    pool.close()
    pool.join()