wangcc7

因为想找个网站练练手,发现这个网站不错,所以对这个网站的部分图片进行了爬取,主要是卡通图片什么的,请直接查看代码。
这个还是初版,后面会有一个升级版多线程版。

# -*- coding: utf-8 -*-
# by wangcc
# mail:wangcc_sd@163.com

import requests
import sys
import io
import os
from bs4 import BeautifulSoup
import asyncio
import json

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding=\'utf8\')  ###改变标准输出的默认编码

headers = {
    \'User-agent\': \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36\'}

async def get_url(queue, url):
    """Fetch one listing page and enqueue each ``<p class="list_h">`` tag.

    Each matched tag carries the ``<a href/title>`` of one gallery; a
    consumer task later pulls it off *queue* and downloads the images.

    The original re-ran ``find_all`` on every loop pass and, by iterating
    the tag's children, put the SAME tag on the queue once per child node —
    duplicating downloads.  Here ``find_all`` runs once and each tag is
    enqueued exactly once.
    """
    print(url)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    for entry in soup.find_all('p', class_='list_h'):
        await queue.put(entry)
    # Small pause between listing pages to be polite to the server.
    await asyncio.sleep(1)

def get_page(url):
    """Return the last page number of a paginated listing, or 0.

    Looks for the "末页" (last page) anchor, whose href looks like
    ``.../list_<n>.html``, and extracts ``<n>`` as a string.  Returns the
    int 0 when no such link exists (single-page listing or unexpected
    markup) — callers treat that as "no extra pages".
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    last_page_links = soup.find_all('a', text="末页")
    try:
        # str(...) of the result list looks like
        # "[<a href=.../list_12.html>末页</a>]"; take the digits between
        # the first '_' and the following '.'.
        return str(last_page_links).split('_')[1].split('.')[0]
    except IndexError as err:
        print(err)
        return 0

def get_url_second(url):
    """Download every image of one gallery, walking all of its pages.

    *url* is the gallery's first page (``.../<id>.html``); the following
    pages are ``.../<id>_<n>.html``.  Each page embeds a JSON-LD
    ``<script type="application/ld+json">`` whose ``images`` entry holds
    the picture URL; the file is saved under ``./date/<title>/``.
    """
    page = get_page(url)
    base = url.split('.html')[0]
    # Pages 2..page inclusive.  The original used
    # ``range(int(page)) if i >= 2`` which silently skipped the last page.
    url_list = [base + '_{}.html'.format(i) for i in range(2, int(page) + 1)]
    url_list.append(url)  # the first page has no _n suffix
    for page_url in url_list:
        response = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        scripts = soup.find_all('script', type="application/ld+json")
        # Skip pages with unexpected markup instead of crashing on them.
        if not scripts or not scripts[0].string:
            continue
        # Parse the script tag's text directly; the original sliced a fixed
        # [37:-10] window out of str(list-of-tags), which breaks as soon as
        # the surrounding markup changes by one character.
        meta = json.loads(scripts[0].string)
        title = meta["title"]
        image_url = meta["images"][0]
        file_name = str(image_url).split('/')[-1]
        save_path = "./date/" + title + '/' + file_name
        image = requests.get(image_url)
        # 'with' closes the file; the original also called close() redundantly.
        with open(save_path, 'wb') as fh:
            fh.write(image.content)



def dir_save(dir_name):
    """Ensure the download directory ``./date/<dir_name>`` exists.

    Uses ``os.makedirs(..., exist_ok=True)`` so the parent ``./date`` is
    created too — the original ``os.mkdir`` raised FileNotFoundError on a
    fresh checkout where ``./date`` did not exist yet.
    """
    os.makedirs(os.path.join('./date', dir_name), exist_ok=True)

async def consumer(queue):
    """Worker loop: pull gallery tags off *queue* and download them.

    Runs until cancelled.  For each tag it creates the per-gallery
    directory and fetches every image via ``get_url_second``.
    """
    while True:
        print('qsize--->', queue.qsize())
        div_obj = await queue.get()
        href = div_obj.a.get('href')
        title = div_obj.a.get('title')
        dir_save(title)
        get_url_second("https://www.uumtu.com" + href)
        # Mark the item processed so queue.join() can be used for shutdown.
        queue.task_done()
        # The original's 'continue' made this sleep unreachable; without it
        # the loop never yields between (blocking) downloads.
        await asyncio.sleep(1)



async def main():
    """Spawn producer/consumer tasks for listing pages 1..49 and wait.

    The original created the tasks and returned immediately, so
    ``asyncio.run(main())`` tore the loop down before any work happened.
    Here we wait for every producer, let the consumers drain the queue,
    then cancel the (infinite) consumer loops.
    """
    queue = asyncio.Queue()
    producers = []
    consumers = []
    for page_no in range(1, 50):
        # url = 'https://www.uumtu.com/meinv/list_{}.html'.format(page_no)
        url = 'https://www.uumtu.com/katong/list_{}.html'.format(page_no)
        print(url)
        producers.append(asyncio.create_task(get_url(queue, url)))
        consumers.append(asyncio.create_task(consumer(queue)))
    # All listing pages scraped and enqueued.
    await asyncio.gather(*producers)
    # Let consumers finish whatever is still queued, then stop them.
    while not queue.empty():
        await asyncio.sleep(0.5)
    for task in consumers:
        task.cancel()

if __name__ == '__main__':
    # (The original also ran a no-op `for i in range(931): pass` here —
    # dead code, removed.)
    asyncio.run(main())


分类:

技术点:

相关文章:

  • 2021-07-02
  • 2021-09-01
  • 2021-11-28
  • 2022-02-04
  • 2022-01-03
  • 2021-11-09
  • 2021-08-07
  • 2021-09-24
猜你喜欢
  • 2021-10-25
  • 2021-09-11
  • 2022-12-23
  • 2021-12-03
  • 2021-09-28
  • 2021-09-07
相关资源
相似解决方案