# python爬取天极网图片
# 使用python爬取天极网图片,需要导入requests模块,以及os,bs4模块,获取网页地址,打开HTML页面,分析代码结构,查找图片对应的div标签,对应的class属性,再找每张图片对应的子标签,找出其中包含的img标签,对应的url地址,拿到url之后,使用requests发送请求,将其保存在django项目下的img文件夹中。
# Low version: download every thumbnail shown on the listing page.
import os

import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
print(base_path)
img_path = os.path.join(base_path, 'img')
# Bug fix: the target folder was never created, so the open() below
# failed with FileNotFoundError on a fresh checkout.
os.makedirs(img_path, exist_ok=True)

response = requests.get('http://pic.yesky.com/c/6_20491_1.shtml')
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
# Container div located by inspecting the page source.
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})
# Each picture sits inside its own <dl> tag under the container.
list_dl = div_obj.find_all(name='dl')
for dl in list_dl:
    # First <img> inside this <dl>; read its src attribute with .get().
    img = dl.find(name='img')
    img_src = img.get('src')
    # Fetch the image bytes from the extracted URL.
    img_response = requests.get(img_src)
    # File name = last path segment of the image URL.
    file_path = os.path.join(img_path, img_src.rsplit('/', 1)[-1])
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
# Advanced version: follow each thumbnail's link and save the preview
# images shown on the detail page, one sub-folder per album.
import os

import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')
# Bug fix: os.mkdir(dir_path) below fails when the 'img' root itself is missing.
os.makedirs(img_path, exist_ok=True)

response = requests.get('http://pic.yesky.com/c/6_20491_1.shtml')
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # container div for the listing
# Each album is a <dd> tag under the container div.
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:
    a_obj = dd.find('a')
    # One sub-folder per album, named after the link text.
    # NOTE(review): a_obj.text may contain characters invalid in file names — confirm.
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'  # the detail page is GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    # Bug fix: some detail pages lack the 'overview' div; calling
    # .find_all on None raised AttributeError (the commented-out
    # try/except in the original hinted at this).
    if div_obj2 is not None:
        for img in div_obj2.find_all(name='img'):
            img_src = img.get('src')
            img_response = requests.get(img_src)
            file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
    break  # deliberately stop after the first album (tutorial demo)
# HD version: for every album on the listing page, download the
# full-resolution variant of each image.
import os

import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')
# Bug fix: os.mkdir(dir_path) below fails when the 'img' root itself is missing.
os.makedirs(img_path, exist_ok=True)

response = requests.get('http://pic.yesky.com/c/6_20491_1.shtml')
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # container div for the listing
# Each album is a <dd> tag under the container div.
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:
    a_obj = dd.find('a')
    # One sub-folder per album, named after the link text.
    # NOTE(review): a_obj.text may contain characters invalid in file names — confirm.
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'  # the detail page is GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    try:
        img_list = div_obj2.find_all(name='img')  # raises AttributeError when div_obj2 is None
        for img in img_list:
            img_src = img.get('src')
            # Swap the thumbnail size segment of the URL for the HD one.
            img_response = requests.get(img_src.replace('113x113', '740x-'))
            file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
    except Exception:
        # Deliberate best-effort: skip albums whose page lacks the
        # overview div or whose download fails, keep crawling the rest.
        pass
# 多进程/多线程爬取五张页面所用高清图
# Multithreaded/multiprocess version: shared imports and path setup.
import os
import threading
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import cpu_count  # number of CPU cores on this machine

import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')
# Bug fix: workers create per-album folders under 'img' with os.mkdir,
# which fails unless the root folder exists first.
os.makedirs(img_path, exist_ok=True)
def picture(num):
    """Download all HD images from listing page *num* into per-album folders.

    num: 1-based page index interpolated into the listing URL.
    Returns None; writes image files under ``img_path`` as a side effect.
    Intended to be submitted to a thread/process pool, one call per page.
    """
    response = requests.get(f'http://pic.yesky.com/c/6_20491_{num}.shtml')
    soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
    div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # listing container div
    # Each album is a <dd> tag under the container div.
    for dd in div_obj.find_all(name='dd'):
        a_obj = dd.find('a')
        # One sub-folder per album, named after the link text.
        # NOTE(review): a_obj.text may contain characters invalid in file names — confirm.
        dir_path = os.path.join(img_path, a_obj.text)
        if not os.path.isdir(dir_path):
            os.mkdir(dir_path)
        a_response = requests.get(a_obj.get('href'))
        a_response.encoding = 'GBK'  # the detail page is GBK-encoded
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
        try:
            # Raises AttributeError when the page lacks the overview div.
            for img in div_obj2.find_all(name='img'):
                img_src = img.get('src')
                # Swap the thumbnail size segment of the URL for the HD one.
                img_response = requests.get(img_src.replace('113x113', '740x-'))
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except Exception:
            # Deliberate best-effort: skip broken albums, keep crawling.
            pass
if __name__ == "__main__":
    import time

    start = time.time()
    # Process-pool variant, kept for reference (note: the original
    # submitted an undefined name `func`; it should be `picture`):
    # p = ProcessPoolExecutor(max_workers=cpu_count())
    # for i in range(1, 6):
    #     p.submit(picture, i)
    # p.shutdown()

    # Thread pool: crawl listing pages 1-5 concurrently (I/O-bound work,
    # so threads overlap the network waits despite the GIL).
    t = ThreadPoolExecutor(max_workers=cpu_count())
    for i in range(1, 6):
        t.submit(picture, i)
    t.shutdown()  # block until all submitted downloads finish
    print('执行时间:{}'.format(time.time() - start))
    # Plain-thread variant, kept for reference:
    # for i in range(1, 6):
    #     a = threading.Thread(target=picture, args=(i,))
    #     a.start()