【问题标题】:How to use python aiohttp library to download multiple webpages?如何使用python aiohttp库下载多个网页?
【发布时间】:2017-12-21 23:44:41
【问题描述】:

我正在尝试从某个视频游戏的 leaderboard(排行榜)异步抓取数据,该排行榜分为每周挑战和每日挑战。到目前为止,我的代码基于 this async client with semaphores 这个示例。不同之处在于,我试图把最后使用事件循环的那部分代码封装进一个函数里。以下是我的代码的相关部分:

from urllib.parse import urljoin
import asyncio
import aiohttp

async def fetch(url, session):
    """Issue ``session.get(url)`` and return the fully read response body.

    The body is read with ``response.read()`` while the response context
    is still open, and that value is handed back to the caller.
    """
    async with session.get(url) as resp:
        payload = await resp.read()
        return payload


async def bound_fetch(url, session, sem):
    """Run ``fetch(url, session)`` while holding the semaphore ``sem``.

    NOTE(review): the awaited result of ``fetch`` is not returned, so this
    coroutine always evaluates to ``None``.
    """
    async with sem:
        await fetch(url, session)

async def fetch_pages(url,pages,session):
    """Schedule one ``bound_fetch`` task per page number and wait for them all.

    Page URLs are built by joining ``url`` with ``str(page)`` for every
    ``page`` in ``range(pages + 1)``.  All tasks share one semaphore of
    size ``LIMIT`` (a name defined outside this snippet).

    NOTE(review): the list produced by ``asyncio.gather`` is awaited but not
    returned, so callers receive ``None``.
    """
    tasks = []
    sem = asyncio.Semaphore(LIMIT)

    for page in range(pages+1):
        task_url = urljoin(url,str(page))
        task = asyncio.ensure_future(bound_fetch(task_url, session, sem))
        tasks.append(task)

    await asyncio.gather(*tasks)

def leaderboard_crawler(date, entries=0, pages=1):
    """Crawl the archive pages for one leaderboard date and return the result.

    The archive URL is the site root joined with ``date + "/"``; that URL,
    ``pages`` and a fresh ``aiohttp.ClientSession`` are handed to
    ``fetch_pages``, which is driven to completion on the current event
    loop.

    Args:
        date: Date code appended to the archive URL.
        entries: Accepted for backward compatibility.
            NOTE(review): the previous implementation derived a per-page
            entry-count list from this value but never used it, so it has
            no effect on what is fetched.
        pages: Forwarded unchanged to ``fetch_pages``.

    Returns:
        Whatever ``fetch_pages`` returns.
    """
    website = "https://www.thronebutt.com/archive/"
    date_url = urljoin(website, date + "/")

    async def _crawl():
        # ClientSession is an *asynchronous* context manager: a plain
        # ``with`` is rejected by aiohttp 3.x, so the session is opened and
        # closed inside a coroutine with ``async with``.
        async with aiohttp.ClientSession() as session:
            return await fetch_pages(date_url, pages, session)

    loop = asyncio.get_event_loop()
    return loop.run_until_complete(_crawl())

def weekly_leaderboard(week, year, entries=0, pages=1):
    """Crawl the leaderboard for a weekly challenge.

    The date code is ``week`` zero-padded to two digits followed directly
    by ``year``; it is passed to ``leaderboard_crawler`` together with
    ``entries`` and ``pages``.
    """
    date_code = f"{week:02d}{year}"
    return leaderboard_crawler(date_code, entries, pages)

def daily_leaderboard(day, month, year, entries=0, pages=1):
    """Crawl the leaderboard for a daily challenge.

    The date code is ``day`` and ``month``, each zero-padded to two digits,
    followed directly by ``year``; it is passed to ``leaderboard_crawler``
    together with ``entries`` and ``pages``.
    """
    date_code = f"{day:02d}{month:02d}{year}"
    return leaderboard_crawler(date_code, entries, pages)

我认为问题出在 fetch_pages 函数中的 asyncio.gather(*tasks) 部分:我不知道如何把它的结果传回 leaderboard_crawler。现在 date_html 的值是 None。我试过 return await asyncio.gather(*tasks),结果它返回一个全是 None 的列表。我也尝试过把它包装在 asyncio.ensure_future 中,再传给 loop.run_until_complete,但这似乎也不起作用。

【问题讨论】:

    标签: python asynchronous web-scraping aiohttp


    【解决方案1】:

    原因很简单:调用链中有两处缺少 return——bound_fetch 没有返回 fetch 的结果,fetch_pages 也没有返回 asyncio.gather 的结果。

    async def bound_fetch(url, session, sem):
        """Run ``fetch(url, session)`` while holding ``sem`` and return its result."""
        async with sem:
            # Before: `await fetch(url, session)` -- the result was awaited
            # but never returned (missing return).
            return await fetch(url, session)  # corrected: propagate the result
    
    async def fetch_pages(url,pages,session):
        """Schedule one ``bound_fetch`` task per page and return the gathered results.

        Page URLs are built by joining ``url`` with ``str(page)`` for every
        ``page`` in ``range(pages + 1)``.  All tasks share one semaphore of
        size ``LIMIT`` (a name defined outside this snippet).
        """
        tasks = []
        sem = asyncio.Semaphore(LIMIT)
    
        for page in range(pages+1):
            task_url = urljoin(url,str(page))
            task = asyncio.ensure_future(bound_fetch(task_url, session, sem))
            tasks.append(task)
    
        # Before: `await asyncio.gather(*tasks)` -- the gathered list was
        # awaited but never returned (missing return).
        return await asyncio.gather(*tasks)  # corrected: hand the results to the caller
    

    工作示例在这里:

    from urllib.parse import urljoin
    import asyncio
    import aiohttp
    
    async def fetch(url, session):
        """Issue ``session.get(url)`` and return the fully read response body.

        The body is read with ``response.read()`` while the response context
        is still open, and that value is handed back to the caller.
        """
        async with session.get(url) as resp:
            payload = await resp.read()
            return payload
    
    
    async def bound_fetch(url, session, sem):
        """Run ``fetch(url, session)`` while holding ``sem`` and return its result."""
        async with sem:
            payload = await fetch(url, session)
            return payload
    
    async def fetch_pages(url, pages, session):
        """Fetch pages ``0`` through ``pages`` under ``url`` and return the results.

        One ``bound_fetch`` task is scheduled per page number in
        ``range(pages + 1)``; each page URL is ``urljoin(url, str(page))``.
        All tasks share a single semaphore of size 5.  The value returned is
        the list produced by ``asyncio.gather`` over those tasks.
        """
        limiter = asyncio.Semaphore(5)
        pending = [
            asyncio.ensure_future(
                bound_fetch(urljoin(url, str(page_no)), session, limiter)
            )
            for page_no in range(pages + 1)
        ]
        return await asyncio.gather(*pending)
    
    def leaderboard_crawler(date, entries=0, pages=1):
        """Crawl the archive pages for one leaderboard date and return the result.

        The archive URL is the site root joined with ``date + "/"``; that URL,
        ``pages`` and a fresh ``aiohttp.ClientSession`` are handed to
        ``fetch_pages``, which is driven to completion on the current event
        loop.

        Args:
            date: Date code appended to the archive URL.
            entries: Accepted for backward compatibility.
                NOTE(review): the previous implementation derived a per-page
                entry-count list from this value but never used it, so it has
                no effect on what is fetched.
            pages: Forwarded unchanged to ``fetch_pages``.

        Returns:
            Whatever ``fetch_pages`` returns.
        """
        website = "https://www.thronebutt.com/archive/"
        date_url = urljoin(website, date + "/")

        async def _crawl():
            # ClientSession is an *asynchronous* context manager: a plain
            # ``with`` is rejected by aiohttp 3.x, so the session is opened
            # and closed inside a coroutine with ``async with``.
            async with aiohttp.ClientSession() as session:
                return await fetch_pages(date_url, pages, session)

        loop = asyncio.get_event_loop()
        return loop.run_until_complete(_crawl())
    
    def weekly_leaderboard(week, year, entries=0, pages=1):
        """Crawl the leaderboard for a weekly challenge.

        The date code is ``week`` zero-padded to two digits followed directly
        by ``year``; it is passed to ``leaderboard_crawler`` together with
        ``entries`` and ``pages``.
        """
        date_code = f"{week:02d}{year}"
        return leaderboard_crawler(date_code, entries, pages)
    
    def daily_leaderboard(day, month, year, entries=0, pages=1):
        """Crawl the leaderboard for a daily challenge.

        The date code is ``day`` and ``month``, each zero-padded to two
        digits, followed directly by ``year``; it is passed to
        ``leaderboard_crawler`` together with ``entries`` and ``pages``.
        """
        date_code = f"{day:02d}{month:02d}{year}"
        return leaderboard_crawler(date_code, entries, pages)
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2017-04-09
      • 2017-08-11
      • 2023-03-28
      • 2021-08-03
      • 1970-01-01
      • 2017-05-12
      • 2010-12-10
      • 1970-01-01
      相关资源
      最近更新 更多