一、单线程版
Mzitu的爬取应该说是比较入门的了,因为它并没有涉及太多的反爬机制,目前来看主要有两点:
headers中Referer参数:其解决方法也很简单,只需要在请求头中加入这个参数就可以了,而且也不需要动态变化,固定为主页地址即可。
请求速度限制:在实际爬取过程中我们会发现,如果爬取速度过快,IP往往会被封,这时只需要适当限制速度或者加入代理池即可。
具体的爬虫分析,网上随便一搜就是一堆,我这里就直接献上代码好了:
# =============================================================================
# Mzitu image crawler (single-threaded)
# =============================================================================
import re
import os
import time
import queue

import requests
from tqdm import tqdm
from termcolor import colored
from colorama import init

# Make ANSI colour codes render in the Windows CMD console.
init(autoreset=False)


class spider_Mzidu():
    """Interactive, single-threaded downloader for Mzitu image albums.

    Workflow (see ``run``): read the total number of listing pages, ask the
    user for a page range, collect ``(album_id, album_title)`` pairs from
    those pages into ``queue_id``, then download every queued album into its
    own directory under the current working directory.
    """

    TIMEOUT = 10      # seconds before a single HTTP request is abandoned
    MAX_RETRY = 5     # failed attempts per album before it is dropped
    RETRY_DELAY = 3   # seconds to wait before a failed album is requeued

    def __init__(self):
        # Request URLs
        self.url_page = 'https://www.mzitu.com/page/%d/'  # listing page (yields album IDs)
        self.url_taotu = 'https://www.mzitu.com/%s'       # album page (yields the image URL)
        # Request headers; the site rejects requests without a Referer.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
                        'Accept': '*/*',
                        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'X-Requested-With': 'XMLHttpRequest',
                        'Connection': 'keep-alive',
                        'Referer': 'https://www.mzitu.com',
                        }
        # Regular expressions (raw strings so that \d reaches the regex engine untouched)
        self.p_id = r'<span><a href="https://www.mzitu.com/(\d*?)" target="_blank">(.*?)</a></span>'
        self.p_imgurl = r'"main-image">.*?<img src="(.*?)"'
        self.p_page = r'…</span>.*?<span>(\d*?)</span>'
        # Work queue of (album_id, album_title) tuples
        self.queue_id = queue.Queue()
        # album_id -> number of failed download attempts so far
        self.fail_count = {}

    @staticmethod
    def _safe_name(name):
        """Return *name* with characters that are illegal in file names replaced by '_'."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', name).rstrip('. ')
        return cleaned or '_'

    def getPages(self):
        """Return the total number of listing pages as an int.

        Raises:
            requests.RequestException: on network failure or HTTP error status.
            IndexError: if the pagination markup is not found in the page.
        """
        res = requests.get(self.url_page % 1, headers=self.headers, timeout=self.TIMEOUT)
        res.raise_for_status()
        html = res.text
        N = re.findall(r'''class="page-numbers dots">[\s\S]*?>(\d*?)</a>[\s\S]*?"next page-numbers"''', html)[0]
        return int(N)

    def getID(self):
        """Prompt for a page range and queue every album found on those pages.

        The range is entered as ``start-end`` (e.g. ``1-10``); a single number
        such as ``5`` is accepted and means just that page.
        """
        page_range = input('请输入爬取页数(如1-10):')
        parts = page_range.split('-')
        p_s = int(parts[0])
        p_e = int(parts[1]) if len(parts) > 1 else p_s
        time.sleep(0.5)
        print(colored('开始获取套图ID'.center(50, '-'), 'green'))
        bar = tqdm(range(p_s, p_e + 1), ncols=60)  # progress bar
        for p in bar:
            res = requests.get(self.url_page % p, headers=self.headers, timeout=self.TIMEOUT)
            html = res.text
            ids = re.findall(self.p_id, html)
            for i in ids:
                self.queue_id.put(i)
            bar.set_description('第%d页' % p)

    def downloadImg(self, imgurl):
        """Fetch *imgurl* and return the response body as bytes.

        Raises:
            requests.RequestException: on network failure or HTTP error status,
                so that an error page is never written to disk as an image.
        """
        res = requests.get(imgurl, headers=self.headers, timeout=self.TIMEOUT)
        res.raise_for_status()
        return res.content

    def parseTaotu(self, taotuID):
        """Return ``(imgurl_template, page_count)`` for the album *taotuID*.

        ``imgurl_template`` is the first image's URL with the two characters
        in front of its last four characters replaced by ``%s``.
        NOTE(review): this assumes the URL ends in a two-digit index plus a
        four-character extension such as ``.jpg`` - confirm against the site.
        """
        res = requests.get(self.url_taotu % taotuID, headers=self.headers, timeout=self.TIMEOUT)
        res.raise_for_status()
        html = res.text
        page = int(re.findall(self.p_page, html)[0])
        imgurl = re.findall(self.p_imgurl, html)[0]
        imgurl = imgurl[:-6] + '%s' + imgurl[-4:]
        return (imgurl, page)

    def _retry_or_drop(self, taotu, err):
        """Requeue a failed album after a pause, or drop it after MAX_RETRY failures."""
        taotuID = taotu[0]
        taotuName = taotu[1]
        failures = self.fail_count.get(taotuID, 0) + 1
        self.fail_count[taotuID] = failures
        if failures >= self.MAX_RETRY:
            # Give up instead of looping forever on an album that cannot succeed.
            print('套图("%s")连续失败%d次,已放弃: %r' % (colored(taotuName, 'red'), failures, err))
            return
        time.sleep(self.RETRY_DELAY)
        self.queue_id.put(taotu)

    def downloadTaotu(self):
        """Download every queued album into a ``[P<count>]<title>`` directory.

        Images are saved as ``1.jpg`` ... ``<count>.jpg``. An album that fails
        is put back at the end of the queue and retried up to MAX_RETRY times.
        """
        while not self.queue_id.empty():
            taotu = self.queue_id.get()
            taotuID = taotu[0]
            taotuName = taotu[1]
            try:
                imgurl, page = self.parseTaotu(taotuID)
                path = '[P%d]' % page + self._safe_name(taotuName)
                os.makedirs(path, exist_ok=True)
                bar = tqdm(range(1, page + 1), ncols=50)  # progress bar
                for i in bar:
                    url = imgurl % (str(i).zfill(2))
                    img = self.downloadImg(url)
                    with open(os.path.join('.', path, '%d.jpg' % i), 'wb') as f:
                        f.write(img)
                print('套图("' + colored(taotuName, 'red') + '")爬取完成')
            except Exception as err:
                # Exception (not a bare except) so that Ctrl+C still stops the program.
                self._retry_or_drop(taotu, err)

    def run(self):
        """Entry point: show the banner, collect album IDs, download them."""
        os.system('cls' if os.name == 'nt' else 'clear')  # clear the console
        print('*' * 35)
        print('*' + '欢迎使用Mzitu下载器'.center(26) + '*')
        print('*' * 35)
        N = self.getPages()
        print(('Mzitu当前共有%s页!' % colored(N, 'red')).center(30))
        print('\n')
        self.getID()
        print('\n' + colored('开始爬取套图'.center(50, '-'), 'green'))
        self.downloadTaotu()


if __name__ == '__main__':
    spider = spider_Mzidu()
    spider.run()
二、多线程版
有小伙伴估计得问了:“单线程这么慢?您是在开玩笑的叭,等得我不得憋坏咯?”
客官这边请,来试试多线程版的好了:
# =============================================================================
# Mzitu image crawler (multi-threaded)
# =============================================================================
import re
import os
import time
import queue
import threading

import requests
from tqdm import tqdm
from termcolor import colored
from colorama import init

# Make ANSI colour codes render in the Windows CMD console.
init(autoreset=False)


# Proxy provider ("XXX" proxy service)
def Get_proxy():
    """Fetch one proxy address from the proxy API and return it as a string.

    The URL below is a placeholder and must be replaced with the real
    endpoint of the proxy service in use.
    NOTE(review): the response body is presumably a bare ``host:port`` -
    confirm against the provider's API.
    """
    res = requests.get('xxxxxxxxxxxxxxxxxxx', timeout=10)
    # Strip the trailing newline/whitespace so it cannot end up inside the proxy URL.
    return res.text.strip()


class spider_Mzidu():
    """Interactive, multi-threaded downloader for Mzitu image albums.

    Workflow (see ``run``): read the total number of listing pages, ask the
    user for a page range, collect ``(album_id, album_title)`` pairs from
    those pages into ``queue_id``, then let several worker threads download
    the queued albums, each worker using its own proxy.
    """

    TIMEOUT = 10      # seconds before a single HTTP request is abandoned
    MAX_RETRY = 5     # failed attempts per album before it is dropped
    RETRY_DELAY = 3   # seconds to wait before a failed album is requeued

    def __init__(self):
        # Request URLs
        self.url_page = 'https://www.mzitu.com/page/%d/'  # listing page (yields album IDs)
        self.url_taotu = 'https://www.mzitu.com/%s'       # album page (yields the image URL)
        # Request headers; the site rejects requests without a Referer.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
                        'Accept': '*/*',
                        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'X-Requested-With': 'XMLHttpRequest',
                        'Connection': 'keep-alive',
                        'Referer': 'https://www.mzitu.com',
                        }
        # Regular expressions (raw strings so that \d reaches the regex engine untouched)
        self.p_id = r'<span><a href="https://www.mzitu.com/(\d*?)" target="_blank">(.*?)</a></span>'
        self.p_imgurl = r'"main-image">.*?<img src="(.*?)"'
        self.p_page = r'…</span>.*?<span>(\d*?)</span>'
        # Work queue of (album_id, album_title) tuples, shared by all workers
        self.queue_id = queue.Queue()
        # album_id -> number of failed download attempts; guarded by _lock
        self.fail_count = {}
        self._lock = threading.Lock()
        # HTTP proxy used by the main thread (getPages / getID)
        self.proxies = self._build_proxies(Get_proxy())

    @staticmethod
    def _build_proxies(proxy):
        """Return the ``proxies`` mapping that requests expects for *proxy*."""
        # NOTE(review): for a plain HTTP proxy the 'https' entry usually also
        # needs an 'http://' proxy URL - confirm with the proxy provider.
        return {'http': 'http://' + proxy,
                'https': 'https://' + proxy}

    @staticmethod
    def _safe_name(name):
        """Return *name* with characters that are illegal in file names replaced by '_'."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', name).rstrip('. ')
        return cleaned or '_'

    def getPages(self):
        """Return the total number of listing pages as an int.

        Raises:
            requests.RequestException: on network failure or HTTP error status.
            IndexError: if the pagination markup is not found in the page.
        """
        res = requests.get(self.url_page % 1, headers=self.headers,
                           proxies=self.proxies, timeout=self.TIMEOUT)
        res.raise_for_status()
        html = res.text
        N = re.findall(r'''class="page-numbers dots">[\s\S]*?>(\d*?)</a>[\s\S]*?"next page-numbers"''', html)[0]
        return int(N)

    def getID(self):
        """Prompt for a page range and queue every album found on those pages.

        The range is entered as ``start-end`` (e.g. ``1-10``); a single number
        such as ``5`` is accepted and means just that page.
        """
        page_range = input('请输入爬取页数(如1-10):')
        parts = page_range.split('-')
        p_s = int(parts[0])
        p_e = int(parts[1]) if len(parts) > 1 else p_s
        time.sleep(0.5)
        print(colored('开始获取套图ID'.center(50, '-'), 'green'))
        bar = tqdm(range(p_s, p_e + 1), ncols=60)  # progress bar
        for p in bar:
            res = requests.get(self.url_page % p, headers=self.headers,
                               proxies=self.proxies, timeout=self.TIMEOUT)
            html = res.text
            ids = re.findall(self.p_id, html)
            for i in ids:
                self.queue_id.put(i)
            bar.set_description('第%d页' % p)

    def downloadImg(self, imgurl, proxies):
        """Fetch *imgurl* through *proxies* and return the response body as bytes.

        Raises:
            requests.RequestException: on network failure or HTTP error status,
                so that an error page is never written to disk as an image.
        """
        res = requests.get(imgurl, headers=self.headers, proxies=proxies, timeout=self.TIMEOUT)
        res.raise_for_status()
        return res.content

    def parseTaotu(self, taotuID, proxies):
        """Return ``(imgurl_template, page_count)`` for the album *taotuID*.

        ``imgurl_template`` is the first image's URL with the two characters
        in front of its last four characters replaced by ``%s``.
        NOTE(review): this assumes the URL ends in a two-digit index plus a
        four-character extension such as ``.jpg`` - confirm against the site.
        """
        res = requests.get(self.url_taotu % taotuID, headers=self.headers,
                           proxies=proxies, timeout=self.TIMEOUT)
        res.raise_for_status()
        html = res.text
        page = int(re.findall(self.p_page, html)[0])
        imgurl = re.findall(self.p_imgurl, html)[0]
        imgurl = imgurl[:-6] + '%s' + imgurl[-4:]
        return (imgurl, page)

    def _retry_or_drop(self, taotu, err):
        """Requeue a failed album after a pause, or drop it after MAX_RETRY failures."""
        taotuID = taotu[0]
        taotuName = taotu[1]
        with self._lock:
            failures = self.fail_count.get(taotuID, 0) + 1
            self.fail_count[taotuID] = failures
        if failures >= self.MAX_RETRY:
            # Give up instead of looping forever on an album that cannot succeed.
            print('套图("%s")连续失败%d次,已放弃: %r' % (colored(taotuName, 'red'), failures, err))
            return
        time.sleep(self.RETRY_DELAY)
        self.queue_id.put(taotu)

    def downloadTaotu(self):
        """Worker loop: download queued albums until the queue is empty.

        Each album goes into a ``[P<count>]<title>`` directory with images
        saved as ``1.jpg`` ... ``<count>.jpg``. On failure the album is
        requeued (up to MAX_RETRY attempts) and this worker switches to a
        fresh proxy.
        """
        proxies = self._build_proxies(Get_proxy())
        while True:
            # get_nowait() instead of "while not empty(): get()": with several
            # workers the queue can be drained between the check and the get,
            # which would leave this thread blocked forever.
            try:
                taotu = self.queue_id.get_nowait()
            except queue.Empty:
                break
            taotuID = taotu[0]
            taotuName = taotu[1]
            try:
                imgurl, page = self.parseTaotu(taotuID, proxies)
                path = '[P%d]' % page + self._safe_name(taotuName)
                os.makedirs(path, exist_ok=True)
                bar = tqdm(range(1, page + 1), ncols=50)  # progress bar
                for i in bar:
                    url = imgurl % (str(i).zfill(2))
                    img = self.downloadImg(url, proxies)
                    with open(os.path.join('.', path, '%d.jpg' % i), 'wb') as f:
                        f.write(img)
                print('套图("' + colored(taotuName, 'red') + '")爬取完成')
            except Exception as err:
                # Exception (not a bare except) so that Ctrl+C still stops the program.
                # Requeue first so the album is not lost if the proxy refresh fails.
                self._retry_or_drop(taotu, err)
                try:
                    proxies = self._build_proxies(Get_proxy())
                except requests.RequestException:
                    pass  # proxy API unreachable: keep using the current proxy

    def changeProxy(self):
        """Replace the main-thread proxy (``self.proxies``) with a fresh one."""
        self.proxies = self._build_proxies(Get_proxy())

    def run(self):
        """Entry point: show the banner, collect album IDs, download with worker threads."""
        os.system('cls' if os.name == 'nt' else 'clear')  # clear the console
        print('*' * 35)
        print('*' + '欢迎使用Mzitu下载器'.center(26) + '*')
        print('*' * 35)
        N = self.getPages()
        print(('Mzitu当前共有%s页!' % colored(N, 'red')).center(30))
        print('\n')
        self.getID()
        print('\n' + colored('开始爬取套图'.center(50, '-'), 'green'))
        # Download the albums with several worker threads
        N_thread = 3
        thread_list = [threading.Thread(target=self.downloadTaotu) for _ in range(N_thread)]
        for t in thread_list:
            t.start()
        for t in thread_list:
            t.join()


if __name__ == '__main__':
    spider = spider_Mzidu()
    spider.run()
细心的大大应该发现了,其实多线程版跟单线程版在结构上几乎没有太大的差别(这里也提供了一种代码思路:以后如果想把原来的代码改为多线程,可以更加方便快捷),主要区别是这两点:
调用downloadTaotu()函数的时候,使用threading模块开启多线程多次调用。
加入了HTTP代理模块。这里大家可以酌情考虑是否保留,不过根据我测试发现,如果是使用多线程的话,建议大家还是加入代理,不然IP很可能被封。
————————————————
————————————————
版权声明:本文为CSDN博主「不正经的kimol君」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/kimol_justdo/article/details/105515579