# Author: lijifei

# Import libraries

import os
import requests
from bs4 import BeautifulSoup
import time 

# Build the request headers

def res_headers():
    """Return the default HTTP headers used for every request.

    The Referer points at the image host's stylesheet; the site refuses
    to serve pictures without a plausible Referer.
    """
    return {
        'User-Agent': 'Mozilla/5.0 ',
        'Referer': 'https://i5.meizitu.net/pfiles/style.css?091102',
    }

# Fetch a page from the site

def get_page(url):
    """Fetch *url* and return the response body as text.

    Raises requests.HTTPError on a non-2xx status so callers never try
    to parse an error page as a gallery.
    """
    headers = res_headers()
    # Context-manage the session so its pooled connections are released
    # even when the request raises (the original leaked the session).
    with requests.session() as s:
        res = s.get(url, headers=headers)
        # Fail loudly here instead of letting BeautifulSoup choke later.
        res.raise_for_status()
        return res.text

# Collect the detail-page URL of every gallery on the archive page

def get_all_girls(url):
    """Return a list of detail-page URLs for every gallery listed at *url*.

    Parses the archive page: each gallery is an <a> tag inside the
    element with class 'archives'.
    """
    html = get_page(url)
    soup = BeautifulSoup(html, 'lxml')
    # All gallery links live under the 'archives' container.
    archive_links = soup.find(class_='archives').find_all('a')
    # Comprehension instead of the original append loop (same result).
    return [link['href'] for link in archive_links]

# Collect every image URL of a single gallery

def get_girl_all_page(url):
    """Gather every image URL of one gallery, then pass them to download_Pic.

    Reads the page count from the pager, visits each numbered sub-page,
    and takes the first <img> src found there.
    """
    soup = BeautifulSoup(get_page(url), 'lxml')
    # The second-to-last pager <a> carries the total page count in its <span>.
    max_page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
    title = soup.find(class_='main-title').string
    pic_url_list = []
    page_no = 0
    while page_no < int(max_page):
        page_no += 1
        page_html = get_page(url + "/%s" % page_no)
        page_soup = BeautifulSoup(page_html, 'lxml')
        # assumes the first <img> on the detail page is the photo itself — TODO confirm
        pic_url_list.append(page_soup.find('img').get('src'))
        time.sleep(0.1)  # small pause between sub-page requests
    download_Pic(title, pic_url_list)

# Download the images, using the gallery title as the folder name

def download_Pic(title, pic_url_list):
    """Download every image in *pic_url_list* into a folder named *title*.

    Files are saved as 1.jpg, 2.jpg, ... in list order.
    """
    # exist_ok avoids the FileExistsError the original os.mkdir raised
    # when a gallery was re-run after a partial download.
    os.makedirs(title, exist_ok=True)
    headers = res_headers()
    # enumerate replaces the hand-rolled j counter.
    for j, item in enumerate(pic_url_list, start=1):
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        # 'with' already closes the file; the original's explicit
        # f.close() inside the block was redundant and is removed.
        with open(filename, 'wb') as f:
            f.write(requests.get(item, headers=headers).content)
    # NOTE(review): 100-second pause between galleries — presumably
    # anti-ban throttling; confirm this duration is intentional.
    time.sleep(100)

# Main program

# Entry point: crawl the archive page, then download every gallery found.
if __name__ == '__main__':
    archive_url = "https://www.mzitu.com/all"
    for detail_url in get_all_girls(archive_url):
        get_girl_all_page(detail_url)

# NOTE: Based on Mr. Cui's video tutorial plus my own testing; request-related
# issues remain and need further improvement.

 

 

 

 

# Category:
# Tech points:
# Related posts: