1.asyncio模块
# 1) Plain asyncio: run several coroutines concurrently on one event loop.
import asyncio


async def task():
    """Demo coroutine: sleep to simulate slow I/O, then report completion.

    asyncio itself speaks TCP, not HTTP -- but HTTP is layered on top of
    TCP, so raw HTTP requests can still be sent over an asyncio TCP
    connection (demonstrated in the next snippet).
    """
    print('start...')
    # Yields control to the event loop for 5 seconds instead of blocking.
    # NOTE: @asyncio.coroutine / `yield from` (used originally) was removed
    # in Python 3.11; `async def` / `await` is the supported form.
    await asyncio.sleep(5)
    print('end')


if __name__ == '__main__':
    # All three coroutines share one loop, so total wall time is ~5s, not 15s.
    tasks = [task(), task(), task()]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
# 2) Raw HTTP over an asyncio TCP connection (no HTTP client library).
import asyncio


async def task(host, url='/'):
    """Fetch *url* from *host* over a raw TCP connection on port 80.

    *host* must be a bare hostname (e.g. ``www.baidu.com``), not a full
    URL: :func:`asyncio.open_connection` resolves it directly via DNS, so
    a ``http://`` scheme prefix would make resolution fail.
    """
    reader, writer = await asyncio.open_connection(host, 80)
    # Hand-built HTTP/1.0 request; HTTP/1.0 closes the connection after the
    # response, which lets us simply read until EOF below.
    request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host)
    request_header_content = bytes(request_header_content, encoding='utf-8')
    writer.write(request_header_content)
    await writer.drain()
    text = await reader.read()
    print(host, url, text)


if __name__ == '__main__':
    # BUG FIX: pass bare hostnames, not 'http://...' URLs.
    tasks = [task('www.baidu.com'), task('www.cnblogs.com')]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
# 3) asyncio + requests: off-load a blocking HTTP library to a thread pool.
import asyncio


async def task(fun, *args):
    """Run the blocking callable ``fun(*args)`` in the default executor.

    *fun* must return an object exposing ``.url`` and ``.content`` (e.g. a
    ``requests.Response``).  The call runs in a worker thread, so the event
    loop stays free while the request is in flight.
    """
    print(fun, args)
    # get_running_loop() is the correct call inside a coroutine (the older
    # get_event_loop() is deprecated in that position).
    loop = asyncio.get_running_loop()
    future = loop.run_in_executor(None, fun, *args)
    response = await future
    print(response.url, response.content)


if __name__ == '__main__':
    # Third-party; imported lazily so the module itself has no hard
    # dependency on requests.
    import requests

    tasks = [
        task(requests.get, 'http://bing.com'),
        task(requests.get, 'http://cnblogs.com'),
    ]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
# 4) aiohttp + asyncio: a natively asynchronous HTTP client.
# pip install aiohttp
import asyncio

import aiohttp


async def task(url):
    """Fetch *url* with aiohttp and print the response object."""
    # aiohttp.request("GET", url) as a bare awaitable (used originally) was
    # removed from aiohttp; the supported API is a ClientSession with async
    # context managers, which also guarantees the connection is released.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            print(response)


if __name__ == '__main__':
    tasks = [task('http://bing.com'), task('http://cnblogs.com')]
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
2.tornado模块
# Tornado's AsyncHTTPClient: fire all fetches, stop the loop when done.
from tornado import ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest

COUNT = None  # total number of URLs scheduled (set by fun())
count = 0     # number of responses handled so far


def handle_response(response):
    """Per-response handler; stops the IOLoop once every URL has answered."""
    global count
    count += 1
    if response.error:
        print('error')
    else:
        body = response.body
        print(body)
    if count == COUNT:
        # Consistently use current() (instance() is a deprecated alias).
        ioloop.IOLoop.current().stop()


def fun():
    """Schedule one fetch per URL; responses arrive via handle_response."""
    url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
    global COUNT
    COUNT = len(url_list)
    for url in url_list:
        http_client = AsyncHTTPClient()
        # BUG FIX: Tornado 6.0 removed the callback= argument to fetch().
        # Use the returned Future instead; raise_error=False preserves the
        # original behaviour of delivering failures through response.error.
        future = http_client.fetch(HTTPRequest(url), raise_error=False)
        ioloop.IOLoop.current().add_future(
            future, lambda f: handle_response(f.result()))


if __name__ == '__main__':
    ioloop.IOLoop.current().add_callback(fun)
    ioloop.IOLoop.current().start()  # blocks until .stop() is called
3.Twisted模块
# Asynchronous crawling with Twisted: fire every request, collect results,
# then shut the reactor down.  (Improves crawler concurrency.)
from twisted.web.client import getPage
from twisted.internet import reactor, defer


def one_done(args):
    """Fires once per completed page fetch with the page body."""
    print(args)
    print(type(args))


def all_done(args):
    """Fires after every deferred has resolved; stops the reactor."""
    print(args)
    print(type(args))
    reactor.stop()


@defer.inlineCallbacks
def tasks(url):
    """Start fetching *url* and yield the deferred for DeferredList."""
    res = getPage(bytes(url, 'utf-8'))
    res.addCallback(one_done)
    yield res


url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
def_list = [tasks(address) for address in url_list]
d = defer.DeferredList(def_list)
d.addBoth(all_done)
reactor.run()  # event loop; blocks until reactor.stop() is called
4.gevent模块
# gevent: greenlet-based coroutines with monkey-patched blocking I/O.
# pip install greenlet   (coroutine primitive)
# pip install gevent     (builds on greenlet)
from gevent import monkey

# BUG FIX: patch_all() must run BEFORE requests (and therefore socket/ssl)
# is imported -- otherwise those modules keep references to the original
# blocking socket implementation and the greenlets serialize.
monkey.patch_all()

import gevent
from gevent.pool import Pool  # greenlet pool

import requests

pool = Pool(3)  # cap the number of concurrent requests at 3


def task(method, url, req_kwargs):
    """Issue one HTTP request and print its final URL and body."""
    print(method, url, req_kwargs)
    response = requests.request(method, url, **req_kwargs)
    print(response.url)
    print(response.content)


if __name__ == '__main__':
    # Unbounded variant, for reference:
    # gevent.joinall([
    #     gevent.spawn(task, method="GET", url="http://cnblogs.com", req_kwargs={}),
    #     gevent.spawn(task, method="GET", url='http://bing.com', req_kwargs={}),
    # ])
    gevent.joinall([
        pool.spawn(task, method="GET", url="http://cnblogs.com", req_kwargs={}),
        pool.spawn(task, method="GET", url='http://bing.com', req_kwargs={}),
    ])
# gevent + requests, pre-packaged: grequests maps a batch of prepared
# requests onto greenlets and returns the responses in order.
import grequests

request_list = [
    grequests.get('http://cnblogs.com'),
    grequests.get('http://bing.com'),
    grequests.get('http://che.com'),
]
response_list = grequests.map(request_list)
print(response_list)
总结:
自定义异步爬虫IO时:
#gevent -> Twisted -> Tornado -> asyncio