asyncio/aiohttp 不返回响应答案

【问题标题】：asyncio/aiohttp not returning response（asyncio/aiohttp 不返回响应）
【发布时间】：2019-08-06 17:47:10
【问题描述】：

我正在尝试通过使用 asyncio/aiohttp 并行化 Web 请求，从 https://www.officialcharts.com/ 中抓取一些数据。我实现了这个链接（here）中给出的代码。

我尝试了两种不同的做法。第一种是这样的。

from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import re
import json

import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
from IPython.display import clear_output
import memory_profiler

import spotipy
import spotipy.util as util
import pandas as pd
from  more_itertools import unique_everseen

# Weekly chart dates as 'YYYYMMDD' strings: 2500 consecutive Sundays,
# starting from the first Sunday on or after 1970-01-01.
epoch = date(1970, 1, 1)
first_sunday = epoch + timedelta(days=6 - epoch.weekday())
weeks = [
    (first_sunday + timedelta(weeks=n)).strftime('%Y%m%d')
    for n in range(2500)
]

import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()

# Collected responses; run() appends one list of bodies per invocation.
result = []


async def fetch(url, session):
    """Issue a GET for *url* on *session* and return the raw response body."""
    async with session.get(url) as resp:
        body = await resp.read()
    return body

async def run(r):
    """Fetch the first *r* weekly chart pages concurrently.

    The gathered response bodies are appended, as a single list, to the
    module-level ``result``; nothing is returned.
    """
    base = 'https://www.officialcharts.com/charts/singles-chart/'

    # One ClientSession is shared by every request so that connections
    # can be kept alive and reused.
    async with ClientSession() as session:
        pending = [
            asyncio.ensure_future(fetch(base + weeks[i] + '/', session))
            for i in range(r)
        ]
        bodies = await asyncio.gather(*pending)
        result.append(bodies)


# Schedule run() for the first 5 weeks and block until it has finished.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(5))
loop.run_until_complete(future)

print('Done')
# Sanity check: prints True when the first fetched body is missing.
# None is a singleton, so it is tested by identity rather than equality.
print(result[0][0] is None)

上述代码的问题是，当我同时发出超过 1000 个请求时，它会失败。

该 post 的作者实现了另一种做法来解决这个问题，并声称它可以处理多达 10K 个请求。我按照他的第二种做法编写了下面的代码。

import random
import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()

# Collected responses; run() appends one list of results per invocation.
result = []


async def fetch(url, session):
    """GET *url*, print the response's DATE/DELAY headers, return the body."""
    async with session.get(url) as resp:
        headers = resp.headers
        delay = headers.get("DELAY")
        stamp = headers.get("DATE")
        print("{}:{} with delay {}".format(stamp, resp.url, delay))
        return await resp.read()


async def bound_fetch(sem, url, session):
    """Fetch *url* while holding *sem* and return the response body.

    The semaphore caps how many fetches are in flight at once.  The value
    produced by fetch() has to be returned from here; without the return,
    every task resolves to None and asyncio.gather() yields only None.
    """
    async with sem:
        return await fetch(url, session)


async def run(r):
    """Fetch the first *r* weekly chart pages through a shared semaphore.

    The gathered task results are appended, as a single list, to the
    module-level ``result``; nothing is returned.
    """
    base = 'https://www.officialcharts.com/charts/singles-chart/'

    # Every fetch has to acquire this semaphore first (1000 slots).
    sem = asyncio.Semaphore(1000)

    # One ClientSession for all requests, so a new connection is not
    # opened for each of them.
    async with ClientSession() as session:
        pending = [
            asyncio.ensure_future(
                bound_fetch(sem, base + weeks[i] + '/', session)
            )
            for i in range(r)
        ]
        gathered = await asyncio.gather(*pending)
        result.append(gathered)

# Number of weekly chart pages to request.
number = 5

# Schedule run() and block until it has finished.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(number))
loop.run_until_complete(future)

print('Done')
# Sanity check: prints True when the first task produced no body.
# None is a singleton, so it is tested by identity rather than equality.
print(result[0][0] is None)

由于某种原因，这不会返回任何响应。

PS：我不是CS背景，只是为了好玩而编程。我不知道异步代码中发生了什么。

【问题讨论】：

标签： python web-scraping python-requests aiohttp

【解决方案1】：

尝试使用最新版本。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads

# Size of the semaphore created in scrape(); bounds concurrent scrapes.
limit = 10
# HTTP status codes that scrape_one() accepts as success.
http_ok = [200]


async def scrape(url_list):
    """Scrape every URL in *url_list* concurrently and return the results.

    Concurrency is bounded by a semaphore of size ``limit``.  The returned
    list holds one scrape_bounded() result per URL.
    """
    gate = Semaphore(limit)

    # All requests share a single client session.
    async with ClientSession() as session:
        pending = [
            ensure_future(scrape_bounded(url, gate, session))
            for url in url_list
        ]
        outcome = await gather(*pending)

    return outcome


async def scrape_bounded(url, sem, session):
    """Run scrape_one() for *url* while holding *sem*; return its result."""
    async with sem:
        scraped = await scrape_one(url, session)
    return scraped


async def scrape_one(url, session):
    """Fetch *url* and return its JSON-decoded body, or False on failure.

    Returns False, after printing a diagnostic, when the connection cannot
    be established or when the status code is not listed in ``http_ok``.
    """
    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        # print() does not interpolate logging-style arguments, so the
        # message is formatted explicitly.
        print(f'Scraping {url} failed due to the connection problem')
        return False

    if response.status not in http_ok:
        print(
            f'Scraping {url} failed due to the return code {response.status}'
        )
        return False

    # The body is decoded as UTF-8 and parsed as JSON.
    return loads(content.decode('UTF-8'))

if __name__ == '__main__':
    # Demo run: scrape two endpoints and pretty-print the results as JSON.
    urls = [
        'http://demin.co/echo1/',
        'http://demin.co/echo2/',
    ]
    res = run(scrape(urls))
    print(dumps(res, indent=4))

这是一个真实项目（project）的模板，可以按预期工作。

你可以在这里（here）找到这份源代码。

【讨论】：

我在 Jupyter-Lab 中收到错误“无法从正在运行的事件循环调用 asyncio.run()”。但它适用于python shell。谢谢！
你应该仔细检查python版本，它应该是3.7.2+，因为自上一版以来asyncio发生了变化
@Dimitrii，是否可以使用 asyncio 和 wget 下载多个文件？我找到了一个答案 here 但我想用 wget 修改你的答案
小心，非异步应用程序可能会导致异常情况，因此最好使用原生的异步方式，例如 aiohttp.get。特别是在您使用 Flask 或 Django 等框架的情况下。如果它有效，您可以用替代解决方案更新示例。不过，你为什么需要 wget？
我是个菜鸟。我在这里（here）发布了一个问题。