1、写在前面
之前一篇随笔记录了异步的个人理解 https://www.cnblogs.com/rainbow-tan/p/15081118.html
之前随笔异步都是以asyncio.sleep()来进行异步操作的演示,下面代码具体演示了一次异步爬虫
2、使用的异步爬虫库为 aiohttp
演示功能:
爬取 https://wall.alphacoders.com/ 中的小图片,进行批量下载,进行下载用时的对比
(1)先用一般的requests库进行爬虫演示,查看运行的时间
import os
import time

import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* with requests and return the Response object."""
    ret = requests.get(url)
    return ret


if __name__ == '__main__':
    index = 0
    start = time.time()
    response = get_html('https://wall.alphacoders.com/')
    soup = BeautifulSoup(response.text, 'lxml')
    boxgrids = soup.find_all(class_='boxgrid')
    # Create the output directory once, before the download loop,
    # instead of re-checking its existence on every iteration.
    path = os.path.abspath('imgs')
    os.makedirs(path, exist_ok=True)
    for boxgrid in boxgrids:
        # Each grid cell wraps the thumbnail as <a><picture><img src=...>.
        img = boxgrid.find('a').find('picture').find('img')
        link = img.attrs['src']
        content = get_html(link).content
        # File extension is taken from the last dot-separated URL segment.
        picture_type = str(link).split('.')[-1]
        index += 1
        with open('{}/{}.{}'.format(path, index, picture_type), 'wb') as f:
            f.write(content)
    end = time.time()
    print(f'下载完成{index}个图片,用时:{end - start}秒')
运行
用时14秒,下载30个图片
(2)使用异步库aiohttp下载
import asyncio
import os
import time

import aiohttp
from aiohttp import TCPConnector
from bs4 import BeautifulSoup


async def get_html(session, url):
    """Fetch the gallery page via *session* and return the thumbnail URLs."""
    async with session.get(url) as resp:
        text = await resp.text()
    soup = BeautifulSoup(text, 'lxml')
    links = []
    for boxgrid in soup.find_all(class_='boxgrid'):
        # Each grid cell wraps the thumbnail as <a><picture><img src=...>.
        img = boxgrid.find('a').find('picture').find('img')
        links.append(img.attrs['src'])
    return links


async def write_file(session, url, index, path):
    """Download one image and save it as ``<path>/<index>.<ext>``."""
    async with session.get(url) as resp:
        data = await resp.read()
    # File extension is taken from the last dot-separated URL segment.
    with open(f'{path}/{index}.{str(url).split(".")[-1]}', 'wb') as f:
        f.write(data)


async def main():
    """Scrape the gallery and download all thumbnails concurrently.

    Returns the number of images downloaded.
    """
    # One shared ClientSession: reuses the connector's pool instead of
    # paying TCP/TLS setup for every single request.
    # ``ssl=False`` replaces the deprecated ``verify_ssl=False`` spelling
    # for disabling certificate verification.
    async with aiohttp.ClientSession(
            connector=TCPConnector(ssl=False)) as session:
        links = await get_html(session, 'https://wall.alphacoders.com/')
        # Create the output directory once, before the concurrent fan-out,
        # so the coroutines do not race on the existence check.
        path = os.path.abspath('images')
        os.makedirs(path, exist_ok=True)
        await asyncio.gather(
            *(write_file(session, link, i, path)
              for i, link in enumerate(links)))
    return len(links)


if __name__ == '__main__':
    start = time.time()
    # asyncio.run() replaces the deprecated get_event_loop()/
    # run_until_complete() pattern and closes the loop on exit.
    index = asyncio.run(main())
    end = time.time()
    print(f'下载完成{index}个图片,用时:{end - start}秒')
运行
下载30个图片,用时4秒
学习链接 :
https://www.jianshu.com/p/20ca9daba85f
https://docs.aiohttp.org/en/stable/client_quickstart.html
https://juejin.cn/post/6857140761926828039 (这个未参考,但是看起来也很牛,收藏一下)