"'
需求:爬取必应的图片,并下载,以老师为例;
'"
1.首先通过XHR找到图片的url地址,打开该地址可看到
确认链接找到正确,然后多找几个来分析,可以得到通用的url:
url = 'https://cn.bing.com/images/async?q=' + '必应输入框内需要搜索的名字' + '&first=' + '页数' + '&count=35&relp=35&scenario=ImageBasicHover&datsrc=N_I&layout=RowBased&mmasync=1'
那么老师图片的url为:https://cn.bing.com/images/async?q=老师&first=页数&count=35&relp=35&scenario=ImageBasicHover&datsrc=N_I&layout=RowBased&mmasync=1
通过xpath解析该url可发现图片链接路径: img_url = html.xpath('//a[@class="iusc"]/@m')
得到每张图片的url,可以开启下载了;
------------------------------------------------------------------分割线----------------------------------------------------------------------------------------
代码如下:
# Standard library
import os
import re
import time
from urllib.parse import quote

# Third-party
import requests
from lxml import etree
# Pretend to be a desktop Chrome browser: Bing refuses or degrades
# responses for requests without a browser-like User-Agent.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
# 保存图片
def save_img(url):
img_name = url[-10:]
name = re.sub('/', '', img_name) # img_name中出现/,将其置换成空
try:
res = requests.get(url, headers=headers)
except OSError:
print('出现错误,错误的url是:', url)
else:
with open('img/'+name, 'wb')as f:
try:
f.write(res.content)
except OSError:
print('无法保存,url是:', url)
# 获取全部图片url
def parse_img(url):
    """Fetch one Bing image-search result page and return the image URLs.

    Parameters
    ----------
    url : str
        An async result-page URL (built in main()).

    Returns
    -------
    list[str]
        The "murl" (direct full-size image link) of every result on the
        page; empty when nothing matched.
    """
    # timeout prevents a stalled connection from hanging the crawl.
    response = requests.get(url, headers=headers, timeout=15)
    # Decode manually and ignore undecodable bytes; the page is UTF-8 but
    # may contain stray binary data.  (Setting response.encoding would be
    # pointless here since we never read response.text.)
    data = response.content.decode('utf-8', 'ignore')
    html = etree.HTML(data)
    # Each <a class="iusc"> carries a JSON-like "m" attribute whose
    # "murl" field is the direct link to the full-size image.
    all_url = []
    for meta in html.xpath('//a[@class="iusc"]/@m'):
        match = re.search('"murl":"(.*?)"', meta)
        # Guard against entries without a murl: the original
        # .group(1) call would raise AttributeError on a None match.
        if match:
            all_url.append(match.group(1))
    return all_url
# 主函数
def main(keyword='老师', pages=4, delay=10):
    """Crawl Bing image search for *keyword* and download every result.

    Parameters
    ----------
    keyword : str
        Search term typed into the Bing image search box.  Defaults to
        '老师', matching the original hard-coded query.
    pages : int
        Number of result pages to fetch; each page holds 35 images.
        The default (4) reproduces the original range(0, 120, 35) loop.
    delay : int | float
        Seconds to sleep between pages, to stay polite to the server.
    """
    base = ('https://cn.bing.com/images/async?q={}&first={}'
            '&count=35&relp=35&scenario=ImageBasicHover&datsrc=N_I'
            '&layout=RowBased&mmasync=1')
    for page in range(pages):
        # quote() URL-encodes the keyword; quote('老师') yields the
        # original literal %E8%80%81%E5%B8%88.
        url = base.format(quote(keyword), page * 35)
        for img_url in parse_img(url):
            save_img(img_url)
            print(img_url)
        time.sleep(delay)  # throttle between pages so Bing does not block us


if __name__ == '__main__':
    main()