# Import libraries
import os
import time

import requests
from bs4 import BeautifulSoup
# Build the request headers
def res_headers():
    """Build the HTTP headers sent with every request.

    The Referer header points at the image host's own CSS so the
    anti-hotlinking check accepts the request.

    :return: dict of header name -> value.
    """
    return {
        'User-Agent': 'Mozilla/5.0 ',
        'Referer': 'https://i5.meizitu.net/pfiles/style.css?091102',
    }
# Request a page from the site
def get_page(url):
    """Fetch *url* and return the response body as text.

    Fixes in this revision: the session was never closed (connection
    leak), and ``s.keep_alive = False`` is not a real
    ``requests.Session`` attribute, so it had no effect and is dropped.

    :param url: absolute URL to request.
    :return: decoded response body (str).
    """
    headers = res_headers()
    # Context manager guarantees the connection pool is released even
    # if the request raises.
    with requests.session() as s:
        res = s.get(url, headers=headers)
        return res.text
# Collect the detail-page URL of every girl on the "all" index page
def get_all_girls(url):
    """Return the detail-page URL of every gallery on the archive page.

    Fix: the original crashed with AttributeError when the
    ``class='archives'`` element was missing (layout change or blocked
    request); now it returns an empty list in that case.

    :param url: URL of the archive index page.
    :return: list of href strings (possibly empty).
    """
    html = get_page(url)
    soup = BeautifulSoup(html, 'lxml')
    archives = soup.find(class_='archives')
    if archives is None:
        # Nothing to scrape - page layout changed or request was rejected.
        return []
    # Every <a> inside the archive section links to one detail page.
    return [a['href'] for a in archives.find_all('a')]
# Collect all image URLs of one girl's gallery
def get_girl_all_page(url):
    """Collect every image URL of one gallery and hand them to the downloader.

    Reads the page count from the pagination bar, visits ``url/1`` ..
    ``url/N``, and records the first ``<img>`` src of each page.

    Fix: the original crashed with AttributeError when a detail page had
    no ``<img>`` tag (e.g. an anti-scraping placeholder); such pages are
    now skipped. Dead commented-out debug lines removed.

    :param url: detail-page URL of one gallery.
    """
    html = get_page(url)
    soup = BeautifulSoup(html, 'lxml')
    # The second-to-last <a> in the pagination bar wraps a <span> with
    # the last page number.
    max_page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
    title = soup.find(class_='main-title').string
    pic_url_list = []
    for page in range(1, int(max_page) + 1):
        page_html = get_page('%s/%s' % (url, page))
        page_soup = BeautifulSoup(page_html, 'lxml')
        img = page_soup.find('img')
        if img is None:
            # Blocked or malformed page - skip instead of crashing.
            continue
        pic_url_list.append(img.get('src'))
        # Throttle so the site does not reject rapid-fire requests.
        time.sleep(0.1)
    download_Pic(title, pic_url_list)
# Download the images, using the gallery title as the folder name
def download_Pic(title, pic_url_list):
    """Download every image URL into a folder named after *title*.

    Fixes: ``os.mkdir`` raised FileExistsError when re-running a
    partially downloaded gallery (now ``makedirs(exist_ok=True)``);
    redundant ``f.close()`` inside the ``with`` block removed; manual
    counter replaced by ``enumerate``.

    :param title: gallery title, used as the directory name.
    :param pic_url_list: iterable of image URLs.
    """
    # exist_ok lets a re-run resume into the same directory.
    os.makedirs(title, exist_ok=True)
    headers = res_headers()
    for index, item in enumerate(pic_url_list, start=1):
        filename = '%s/%s.jpg' % (title, str(index))
        print('downloading....%s : NO.%s' % (title, str(index)))
        # 'with' closes the file automatically.
        with open(filename, 'wb') as f:
            img = requests.get(item, headers=headers).content
            f.write(img)
        # NOTE(review): 100 seconds between images looks like a typo for
        # a ~1s throttle, but is kept as-is to preserve behavior -
        # confirm the intended delay.
        time.sleep(100)
# Main program
if __name__ == '__main__':
    # Entry point: crawl the archive index, then each gallery in turn.
    start_url = "https://www.mzitu.com/all"
    for girl_url in get_all_girls(start_url):
        get_girl_all_page(girl_url)
# NOTE: Based on teacher Cui's video tutorial plus my own testing; some
# request-handling issues remain and need further improvement.