【发布时间】:2019-04-02 14:56:15
【问题描述】:
我正在使用 aiohttp 和 limited_as_completed 方法来加快报废速度(大约 1 亿个静态网站页面)。但是,代码在几分钟后停止,并返回 TimeoutError。我尝试了几件事,但仍然无法阻止引发 asyncio.TimeoutError。请问如何才能忽略错误,继续?
我运行的代码是:
N=123
import html
from lxml import etree
import requests
import asyncio
import aiohttp
from aiohttp import ClientSession, TCPConnector
import pandas as pd
import re
import csv
import time
from itertools import islice
import sys
from contextlib import suppress
start = time.time()
data = {}
data['name'] = []
filename = "C:\\Users\\xxxx"+ str(N) + ".csv"
def limited_as_completed(coros, limit):
futures = [
asyncio.ensure_future(c)
for c in islice(coros, 0, limit)
]
async def first_to_finish():
while True:
await asyncio.sleep(0)
for f in futures:
if f.done():
futures.remove(f)
try:
newf = next(coros)
futures.append(
asyncio.ensure_future(newf))
except StopIteration as e:
pass
return f.result()
while len(futures) > 0:
yield first_to_finish()
async def get_info_byid(i, url, session):
async with session.get(url,timeout=20) as resp:
print(url)
with suppress(asyncio.TimeoutError):
r = await resp.text()
name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
data['name'].append(name)
dataframe = pd.DataFrame(data)
dataframe.to_csv(filename, index=False, sep='|')
limit = 1000
async def print_when_done(tasks):
for res in limited_as_completed(tasks, limit):
await res
url = "http://xxx.{}.html"
loop = asyncio.get_event_loop()
async def main():
connector = TCPConnector(limit=10)
async with ClientSession(connector=connector,headers=headers,raise_for_status=False) as session:
coros = (get_info_byid(i, url.format(i), session) for i in range(N,N+1000000))
await print_when_done(coros)
loop.run_until_complete(main())
loop.close()
print("took", time.time() - start, "seconds.")
错误日志是:
Traceback (most recent call last):
File "C:\Users\xxx.py", line 111, in <module>
loop.run_until_complete(main())
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\asyncio\base_events.py", line 573, in run_until_complete
return future.result()
File "C:\Users\xxx.py", line 109, in main
await print_when_done(coros)
File "C:\Users\xxx.py", line 98, in print_when_done
await res
File "C:\Users\xxx.py", line 60, in first_to_finish
return f.result()
File "C:\Users\xxx.py", line 65, in get_info_byid
async with session.get(url,timeout=20) as resp:
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client.py", line 855, in __aenter__
self._resp = await self._coro
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client.py", line 391, in _request
await resp.start(conn)
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client_reqrep.py", line 770, in start
self._continue = None
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\helpers.py", line 673, in __exit__
raise asyncio.TimeoutError from None
concurrent.futures._base.TimeoutError
我试过了 1)添加期望asyncio.TimeoutError:通过。不工作
async def get_info_byid(i, url, session):
async with session.get(url,timeout=20) as resp:
print(url)
try:
r = await resp.text()
name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
data['name'].append(name)
dataframe = pd.DataFrame(data)
dataframe.to_csv(filename, index=False, sep='|')
except asyncio.TimeoutError:
pass
2) 如上所示抑制(asyncio.TimeoutError)。不工作
我昨天刚刚学习了aiohttp,所以也许我的代码中还有其他错误导致仅在运行几分钟后才导致超时错误?如果有人知道如何处理,非常感谢!
【问题讨论】:
-
尝试将
async with session.get(url,timeout=20) as resp:包裹在try except中。 -
@YuriiKramarenko 非常感谢!它解决了一些错误,但一段时间后 asyncio 不断出现其他新错误,例如内存问题。我可能需要返回请求或使用批处理调用小范围的 py,以便我可以传递内存或随机错误
-
我认为最好的解决方案是为
session.get()编写简单的包装器,允许您在没有上下文管理器的情况下发出请求并将其与gather 一起使用。 -
在答案中添加小例子
标签: python exception-handling python-asyncio aiohttp timeoutexception