# python爬取天极网图片
# 使用python爬取天极网图片,需要导入requests模块,以及os,bs4模块,获取网页地址,打开HTML页面,分析代码结构,查找图片对应的div标签,对应的class属性,再找每张图片对应的子标签,找出其中包含的img标签,对应的url地址,拿到url之后,使用requests发送请求,将其保存在django项目下的img文件夹中。
# Low version: download every thumbnail shown on the listing page.
import os

import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
print(base_path)
img_path = os.path.join(base_path, 'img')
# Bug fix: the target folder was never created, so the open() below
# failed with FileNotFoundError on a fresh checkout.
os.makedirs(img_path, exist_ok=True)

response = requests.get('http://pic.yesky.com/c/6_20491_1.shtml')
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
# Container div located by inspecting the page source.
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})
# Each picture sits inside its own <dl> tag under the container.
list_dl = div_obj.find_all(name='dl')
for dl in list_dl:
    # First <img> inside this <dl>; read its src attribute with .get().
    img = dl.find(name='img')
    img_src = img.get('src')
    # Fetch the image bytes from the extracted URL.
    img_response = requests.get(img_src)
    # File name = last path segment of the image URL.
    file_path = os.path.join(img_path, img_src.rsplit('/', 1)[-1])
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
# Advanced version: follow each thumbnail's link and save the preview
# images shown on the detail page, one sub-folder per album.
import os

import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')
# Bug fix: os.mkdir(dir_path) below fails when the 'img' root itself is missing.
os.makedirs(img_path, exist_ok=True)

response = requests.get('http://pic.yesky.com/c/6_20491_1.shtml')
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # container div for the listing
# Each album is a <dd> tag under the container div.
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:
    a_obj = dd.find('a')
    # One sub-folder per album, named after the link text.
    # NOTE(review): a_obj.text may contain characters invalid in file names — confirm.
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'  # the detail page is GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    # Bug fix: some detail pages lack the 'overview' div; calling
    # .find_all on None raised AttributeError (the commented-out
    # try/except in the original hinted at this).
    if div_obj2 is not None:
        for img in div_obj2.find_all(name='img'):
            img_src = img.get('src')
            img_response = requests.get(img_src)
            file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
    break  # deliberately stop after the first album (tutorial demo)
# HD version: for every album on the listing page, download the
# full-resolution variant of each image.
import os

import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')
# Bug fix: os.mkdir(dir_path) below fails when the 'img' root itself is missing.
os.makedirs(img_path, exist_ok=True)

response = requests.get('http://pic.yesky.com/c/6_20491_1.shtml')
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # container div for the listing
# Each album is a <dd> tag under the container div.
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:
    a_obj = dd.find('a')
    # One sub-folder per album, named after the link text.
    # NOTE(review): a_obj.text may contain characters invalid in file names — confirm.
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'  # the detail page is GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    try:
        img_list = div_obj2.find_all(name='img')  # raises AttributeError when div_obj2 is None
        for img in img_list:
            img_src = img.get('src')
            # Swap the thumbnail size segment of the URL for the HD one.
            img_response = requests.get(img_src.replace('113x113', '740x-'))
            file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
    except Exception:
        # Deliberate best-effort: skip albums whose page lacks the
        # overview div or whose download fails, keep crawling the rest.
        pass
# 多进程/多线程爬取五张页面所用高清图
# Multithreaded/multiprocess version: shared imports and path setup.
import os
import threading
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import cpu_count  # number of CPU cores on this machine

import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')
# Bug fix: workers create per-album folders under 'img' with os.mkdir,
# which fails unless the root folder exists first.
os.makedirs(img_path, exist_ok=True)
def picture(num):
    """Download all HD images from listing page *num* into per-album folders.

    num: 1-based page index interpolated into the listing URL.
    Returns None; writes image files under ``img_path`` as a side effect.
    Intended to be submitted to a thread/process pool, one call per page.
    """
    response = requests.get(f'http://pic.yesky.com/c/6_20491_{num}.shtml')
    soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
    div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # listing container div
    # Each album is a <dd> tag under the container div.
    for dd in div_obj.find_all(name='dd'):
        a_obj = dd.find('a')
        # One sub-folder per album, named after the link text.
        # NOTE(review): a_obj.text may contain characters invalid in file names — confirm.
        dir_path = os.path.join(img_path, a_obj.text)
        if not os.path.isdir(dir_path):
            os.mkdir(dir_path)
        a_response = requests.get(a_obj.get('href'))
        a_response.encoding = 'GBK'  # the detail page is GBK-encoded
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
        try:
            # Raises AttributeError when the page lacks the overview div.
            for img in div_obj2.find_all(name='img'):
                img_src = img.get('src')
                # Swap the thumbnail size segment of the URL for the HD one.
                img_response = requests.get(img_src.replace('113x113', '740x-'))
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except Exception:
            # Deliberate best-effort: skip broken albums, keep crawling.
            pass
if __name__ == "__main__":
    import time

    start = time.time()
    # Process-pool variant, kept for reference (note: the original
    # submitted an undefined name `func`; it should be `picture`):
    # p = ProcessPoolExecutor(max_workers=cpu_count())
    # for i in range(1, 6):
    #     p.submit(picture, i)
    # p.shutdown()

    # Thread pool: crawl listing pages 1-5 concurrently (I/O-bound work,
    # so threads overlap the network waits despite the GIL).
    t = ThreadPoolExecutor(max_workers=cpu_count())
    for i in range(1, 6):
        t.submit(picture, i)
    t.shutdown()  # block until all submitted downloads finish
    print('执行时间:{}'.format(time.time() - start))
    # Plain-thread variant, kept for reference:
    # for i in range(1, 6):
    #     a = threading.Thread(target=picture, args=(i,))
    #     a.start()