wangcc7

因为想找个网站练练手,发现这个网站不错,所以对这个网站的部分图片进行了爬取,主要是卡通图片什么的,请直接查看代码。
这个还是初版,后面会有一个升级版多线程版。

# -*- coding: utf-8 -*-
# by wangcc
# mail:wangcc_sd@163.com

import requests
import sys
import io
import os
from bs4 import BeautifulSoup
import asyncio
import json

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding=\'utf8\')  ###改变标准输出的默认编码

headers = {
    \'User-agent\': \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36\'}

async def get_url(queue, url):
    """Fetch one listing page and enqueue each ``<p class="list_h">`` tag.

    Each matched tag carries the ``<a href/title>`` of one gallery; a
    consumer task later pulls it off *queue* and downloads the images.

    The original re-ran ``find_all`` on every loop pass and, by iterating
    the tag's children, put the SAME tag on the queue once per child node —
    duplicating downloads.  Here ``find_all`` runs once and each tag is
    enqueued exactly once.
    """
    print(url)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    for entry in soup.find_all('p', class_='list_h'):
        await queue.put(entry)
    # Small pause between listing pages to be polite to the server.
    await asyncio.sleep(1)

def get_page(url):
    """Return the last page number of a paginated listing, or 0.

    Looks for the "末页" (last page) anchor, whose href looks like
    ``.../list_<n>.html``, and extracts ``<n>`` as a string.  Returns the
    int 0 when no such link exists (single-page listing or unexpected
    markup) — callers treat that as "no extra pages".
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    last_page_links = soup.find_all('a', text="末页")
    try:
        # str(...) of the result list looks like
        # "[<a href=.../list_12.html>末页</a>]"; take the digits between
        # the first '_' and the following '.'.
        return str(last_page_links).split('_')[1].split('.')[0]
    except IndexError as err:
        print(err)
        return 0

def get_url_second(url):
    """Download every image of one gallery, walking all of its pages.

    *url* is the gallery's first page (``.../<id>.html``); the following
    pages are ``.../<id>_<n>.html``.  Each page embeds a JSON-LD
    ``<script type="application/ld+json">`` whose ``images`` entry holds
    the picture URL; the file is saved under ``./date/<title>/``.
    """
    page = get_page(url)
    base = url.split('.html')[0]
    # Pages 2..page inclusive.  The original used
    # ``range(int(page)) if i >= 2`` which silently skipped the last page.
    url_list = [base + '_{}.html'.format(i) for i in range(2, int(page) + 1)]
    url_list.append(url)  # the first page has no _n suffix
    for page_url in url_list:
        response = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        scripts = soup.find_all('script', type="application/ld+json")
        # Skip pages with unexpected markup instead of crashing on them.
        if not scripts or not scripts[0].string:
            continue
        # Parse the script tag's text directly; the original sliced a fixed
        # [37:-10] window out of str(list-of-tags), which breaks as soon as
        # the surrounding markup changes by one character.
        meta = json.loads(scripts[0].string)
        title = meta["title"]
        image_url = meta["images"][0]
        file_name = str(image_url).split('/')[-1]
        save_path = "./date/" + title + '/' + file_name
        image = requests.get(image_url)
        # 'with' closes the file; the original also called close() redundantly.
        with open(save_path, 'wb') as fh:
            fh.write(image.content)



def dir_save(dir_name):
    """Ensure the download directory ``./date/<dir_name>`` exists.

    Uses ``os.makedirs(..., exist_ok=True)`` so the parent ``./date`` is
    created too — the original ``os.mkdir`` raised FileNotFoundError on a
    fresh checkout where ``./date`` did not exist yet.
    """
    os.makedirs(os.path.join('./date', dir_name), exist_ok=True)

async def consumer(queue):
    """Worker loop: pull gallery tags off *queue* and download them.

    Runs until cancelled.  For each tag it creates the per-gallery
    directory and fetches every image via ``get_url_second``.
    """
    while True:
        print('qsize--->', queue.qsize())
        div_obj = await queue.get()
        href = div_obj.a.get('href')
        title = div_obj.a.get('title')
        dir_save(title)
        get_url_second("https://www.uumtu.com" + href)
        # Mark the item processed so queue.join() can be used for shutdown.
        queue.task_done()
        # The original's 'continue' made this sleep unreachable; without it
        # the loop never yields between (blocking) downloads.
        await asyncio.sleep(1)



async def main():
    """Spawn producer/consumer tasks for listing pages 1..49 and wait.

    The original created the tasks and returned immediately, so
    ``asyncio.run(main())`` tore the loop down before any work happened.
    Here we wait for every producer, let the consumers drain the queue,
    then cancel the (infinite) consumer loops.
    """
    queue = asyncio.Queue()
    producers = []
    consumers = []
    for page_no in range(1, 50):
        # url = 'https://www.uumtu.com/meinv/list_{}.html'.format(page_no)
        url = 'https://www.uumtu.com/katong/list_{}.html'.format(page_no)
        print(url)
        producers.append(asyncio.create_task(get_url(queue, url)))
        consumers.append(asyncio.create_task(consumer(queue)))
    # All listing pages scraped and enqueued.
    await asyncio.gather(*producers)
    # Let consumers finish whatever is still queued, then stop them.
    while not queue.empty():
        await asyncio.sleep(0.5)
    for task in consumers:
        task.cancel()

if __name__ == '__main__':
    # (The original also ran a no-op `for i in range(931): pass` here —
    # dead code, removed.)
    asyncio.run(main())


分类:

技术点:

相关文章:

  • 2021-07-02
  • 2021-09-01
  • 2021-11-28
  • 2022-02-04
  • 2022-01-03
  • 2021-11-09
  • 2021-08-07
  • 2021-09-24
猜你喜欢
  • 2021-10-25
  • 2021-09-11
  • 2022-12-23
  • 2021-12-03
  • 2021-09-28
  • 2021-09-07
相关资源
相似解决方案