【python】王者荣耀全英雄头像爬虫（多线程）

成果展示

在这里插入图片描述

目标网址

https://pvp.qq.com/web201605/herolist.shtml

依赖模块

beautifulsoup4：用于网页元素审查和定位。

pip install beautifulsoup4

lxml：用于网页文档高速解析。

pip install lxml

requests：用于发送和接收网络请求。

pip install requests

完整代码

import concurrent.futures as cf
from bs4 import BeautifulSoup
import os, time, requests


class HeroDownloader(object):
    """Download the hero avatars listed on the Honor of Kings hero page.

    The listing page (``self.site``) is fetched once, every ``<img>`` under
    ``div.herolist-content li a`` is treated as one avatar, and the images
    are downloaded concurrently into ``self.root`` while a single-line
    progress bar is printed.
    """

    # Seconds to wait on any single HTTP request; without a timeout one
    # stalled connection would block its worker thread indefinitely.
    timeout = 10

    def __init__(self):
        """Create the avatar output folder and remember the listing URL."""
        self.root = './heros/'
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.root, exist_ok=True)
        self.site = 'https://pvp.qq.com/web201605/herolist.shtml'

    def down(self, name, url):
        """Download a single avatar.

        Args:
            name: Destination file path.
            url: Absolute URL of the image.

        Returns:
            True when the image was written, False when the request or the
            file write failed (the error is printed, not raised, so one bad
            avatar does not abort the whole batch).
        """
        try:
            res = requests.get(url, timeout=self.timeout)
            # Reject 4xx/5xx before touching the disk so an HTML error page
            # is never saved under a .jpg name.
            res.raise_for_status()
            with open(name, 'wb') as f:
                f.write(res.content)
        except (requests.RequestException, OSError) as e:
            print(f"{name}: {e}")
            return False
        return True

    def show(self, num, _sum, runTime):
        """Print a one-line progress bar that overwrites itself.

        Args:
            num: Number of finished downloads.
            _sum: Total number of downloads.
            runTime: Seconds elapsed since the downloads started.
        """
        barLen = 20  # width of the bar in characters
        # An empty job list counts as complete; this also avoids dividing
        # by zero. The ratio is clamped so the bar never over/underflows.
        perFin = min(max(num / _sum, 0.0), 1.0) if _sum > 0 else 1.0
        numFin = round(barLen * perFin)
        numNon = barLen - numFin
        if perFin > 0:
            # Linear extrapolation: remaining share times the projected total.
            eta = f"{(1 - perFin) * (runTime / perFin):.0f}"
        else:
            # Nothing finished yet, so there is no rate to extrapolate from.
            eta = "--"
        print(
            f"{num:0>{len(str(_sum))}}/{_sum}",
            f"|{'█' * numFin}{' ' * numNon}|",
            f"PROCESS: {perFin * 100:.0f}%",
            f"RUN: {runTime:.0f}S",
            f"ETA: {eta}S",
            end='\r',
            # The line ends in '\r', not '\n', so a line-buffered terminal
            # would otherwise hold it back until the final newline.
            flush=True,
        )

    def main(self):
        """Fetch the hero list and download every avatar with a thread pool.

        Raises:
            requests.RequestException: If the listing page cannot be fetched.
        """
        resp = requests.get(self.site, timeout=self.timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, 'lxml')
        # Locate the avatar <img> tags (the original author noted 93 heroes
        # on the live server as of 2020/12/09).
        hero = soup.select('div.herolist-content li a img')

        jobs = []
        for item in hero:
            alt = item.get('alt')
            src = item.get('src')
            if not alt or not src:
                continue  # an incomplete tag cannot yield a name or a URL
            # The alt text becomes the file name; strip path separators so
            # it cannot point outside the output folder.
            safe = alt.replace('/', '_').replace(os.sep, '_')
            name = os.path.join(self.root, safe + '.jpg')
            # The scheme is prepended as before (presumably the page uses
            # protocol-relative "//host/..." links); URLs that are already
            # absolute are left untouched.
            if src.startswith(('http://', 'https://')):
                url = src
            else:
                url = 'https:' + src
            jobs.append((name, url))

        total = len(jobs)
        count = 0
        failed = 0
        with cf.ThreadPoolExecutor() as tp:
            t1 = time.time()
            futures = [tp.submit(self.down, name, url) for name, url in jobs]
            for future in cf.as_completed(futures):
                count += 1
                # exception() is checked first so an unexpected error in a
                # worker is counted instead of being re-raised by result().
                if future.exception() is not None or not future.result():
                    failed += 1
                self.show(count, total, time.time() - t1)
        print()  # finish the '\r'-terminated progress line
        if failed:
            print(f"{failed}/{total} downloads failed")


# Script entry point: build the downloader, then run the full crawl.
if __name__ == "__main__":
    downloader = HeroDownloader()
    downloader.main()

进度打印

93/93 |████████████████████| PROCESS: 100% RUN: 2S ETA: 0S

头像下载

https://ghgxj.lanzous.com/i9o9fj7j6cd