1.asyncio模块
# 1) Plain asyncio: run several coroutines concurrently on one event loop.
import asyncio


async def task():
    """Demo coroutine: sleep to simulate slow I/O, then report completion.

    asyncio itself speaks TCP, not HTTP -- but HTTP is layered on top of
    TCP, so raw HTTP requests can still be sent over an asyncio TCP
    connection (demonstrated in the next snippet).
    """
    print('start...')
    # Yields control to the event loop for 5 seconds instead of blocking.
    # NOTE: @asyncio.coroutine / `yield from` (used originally) was removed
    # in Python 3.11; `async def` / `await` is the supported form.
    await asyncio.sleep(5)
    print('end')


if __name__ == '__main__':
    # All three coroutines share one loop, so total wall time is ~5s, not 15s.
    tasks = [task(), task(), task()]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
# 2) Raw HTTP over an asyncio TCP connection (no HTTP client library).
import asyncio


async def task(host, url='/'):
    """Fetch *url* from *host* over a raw TCP connection on port 80.

    *host* must be a bare hostname (e.g. ``www.baidu.com``), not a full
    URL: :func:`asyncio.open_connection` resolves it directly via DNS, so
    a ``http://`` scheme prefix would make resolution fail.
    """
    reader, writer = await asyncio.open_connection(host, 80)
    # Hand-built HTTP/1.0 request; HTTP/1.0 closes the connection after the
    # response, which lets us simply read until EOF below.
    request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host)
    request_header_content = bytes(request_header_content, encoding='utf-8')
    writer.write(request_header_content)
    await writer.drain()
    text = await reader.read()
    print(host, url, text)


if __name__ == '__main__':
    # BUG FIX: pass bare hostnames, not 'http://...' URLs.
    tasks = [task('www.baidu.com'), task('www.cnblogs.com')]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
# 3) asyncio + requests: off-load a blocking HTTP library to a thread pool.
import asyncio


async def task(fun, *args):
    """Run the blocking callable ``fun(*args)`` in the default executor.

    *fun* must return an object exposing ``.url`` and ``.content`` (e.g. a
    ``requests.Response``).  The call runs in a worker thread, so the event
    loop stays free while the request is in flight.
    """
    print(fun, args)
    # get_running_loop() is the correct call inside a coroutine (the older
    # get_event_loop() is deprecated in that position).
    loop = asyncio.get_running_loop()
    future = loop.run_in_executor(None, fun, *args)
    response = await future
    print(response.url, response.content)


if __name__ == '__main__':
    # Third-party; imported lazily so the module itself has no hard
    # dependency on requests.
    import requests

    tasks = [
        task(requests.get, 'http://bing.com'),
        task(requests.get, 'http://cnblogs.com'),
    ]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
# 4) aiohttp + asyncio: a natively asynchronous HTTP client.
# pip install aiohttp
import asyncio

import aiohttp


async def task(url):
    """Fetch *url* with aiohttp and print the response object."""
    # aiohttp.request("GET", url) as a bare awaitable (used originally) was
    # removed from aiohttp; the supported API is a ClientSession with async
    # context managers, which also guarantees the connection is released.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            print(response)


if __name__ == '__main__':
    tasks = [task('http://bing.com'), task('http://cnblogs.com')]
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
2.tornado模块
# Tornado's AsyncHTTPClient: fire all fetches, stop the loop when done.
from tornado import ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest

COUNT = None  # total number of URLs scheduled (set by fun())
count = 0     # number of responses handled so far


def handle_response(response):
    """Per-response handler; stops the IOLoop once every URL has answered."""
    global count
    count += 1
    if response.error:
        print('error')
    else:
        body = response.body
        print(body)
    if count == COUNT:
        # Consistently use current() (instance() is a deprecated alias).
        ioloop.IOLoop.current().stop()


def fun():
    """Schedule one fetch per URL; responses arrive via handle_response."""
    url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
    global COUNT
    COUNT = len(url_list)
    for url in url_list:
        http_client = AsyncHTTPClient()
        # BUG FIX: Tornado 6.0 removed the callback= argument to fetch().
        # Use the returned Future instead; raise_error=False preserves the
        # original behaviour of delivering failures through response.error.
        future = http_client.fetch(HTTPRequest(url), raise_error=False)
        ioloop.IOLoop.current().add_future(
            future, lambda f: handle_response(f.result()))


if __name__ == '__main__':
    ioloop.IOLoop.current().add_callback(fun)
    ioloop.IOLoop.current().start()  # blocks until .stop() is called
3.Twisted模块
# Asynchronous crawling with Twisted: fire every request, collect results,
# then shut the reactor down.  (Improves crawler concurrency.)
from twisted.web.client import getPage
from twisted.internet import reactor, defer


def one_done(args):
    """Fires once per completed page fetch with the page body."""
    print(args)
    print(type(args))


def all_done(args):
    """Fires after every deferred has resolved; stops the reactor."""
    print(args)
    print(type(args))
    reactor.stop()


@defer.inlineCallbacks
def tasks(url):
    """Start fetching *url* and yield the deferred for DeferredList."""
    res = getPage(bytes(url, 'utf-8'))
    res.addCallback(one_done)
    yield res


url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
def_list = [tasks(address) for address in url_list]
d = defer.DeferredList(def_list)
d.addBoth(all_done)
reactor.run()  # event loop; blocks until reactor.stop() is called
4.gevent模块
# gevent: greenlet-based coroutines with monkey-patched blocking I/O.
# pip install greenlet   (coroutine primitive)
# pip install gevent     (builds on greenlet)
from gevent import monkey

# BUG FIX: patch_all() must run BEFORE requests (and therefore socket/ssl)
# is imported -- otherwise those modules keep references to the original
# blocking socket implementation and the greenlets serialize.
monkey.patch_all()

import gevent
from gevent.pool import Pool  # greenlet pool

import requests

pool = Pool(3)  # cap the number of concurrent requests at 3


def task(method, url, req_kwargs):
    """Issue one HTTP request and print its final URL and body."""
    print(method, url, req_kwargs)
    response = requests.request(method, url, **req_kwargs)
    print(response.url)
    print(response.content)


if __name__ == '__main__':
    # Unbounded variant, for reference:
    # gevent.joinall([
    #     gevent.spawn(task, method="GET", url="http://cnblogs.com", req_kwargs={}),
    #     gevent.spawn(task, method="GET", url='http://bing.com', req_kwargs={}),
    # ])
    gevent.joinall([
        pool.spawn(task, method="GET", url="http://cnblogs.com", req_kwargs={}),
        pool.spawn(task, method="GET", url='http://bing.com', req_kwargs={}),
    ])
# gevent + requests, pre-packaged: grequests maps a batch of prepared
# requests onto greenlets and returns the responses in order.
import grequests

request_list = [
    grequests.get('http://cnblogs.com'),
    grequests.get('http://bing.com'),
    grequests.get('http://che.com'),
]
response_list = grequests.map(request_list)
print(response_list)
总结:
自定义异步爬虫IO时:
#gevent -> Twisted -> Tornado -> asyncio