项目目录

  全站深度爬取图片案例

 爬虫文件setuw.py

 1 # -*- coding: utf-8 -*-
 2 import time
 3 from lxml.html.clean import etree
 4 
 5 import scrapy
 6 from meituwangPro.items import MeituwangproItem
 7 
 8 
 9 class SetuwSpider(scrapy.Spider):
10     name = 'setuw'
11     # allowed_domains = ['http://www.setuw.com/']
12     start_urls = ['http://www.setuw.com']
13     #首页图片分类解析
14     def parse(self, response):
15         dir_list = response.xpath('//ul[@class="as as1"]/a | //ul[@class="as as2"]/a')
16         for i in dir_list:
17             item = MeituwangproItem()
18             item['tag'] = i.xpath('./text()').extract_first()
19             url = self.start_urls[0] + i.xpath('./@href').extract_first()
20 
21             #对图片分类发起请求,获取专辑信息
22             yield scrapy.Request(url, callback=self.parse_second, meta={'item': item})
23             # break
24 
25     #对分类的图片循专辑信息进行解析
26     def parse_second(self, response):
27         item = response.meta['item']
28         back_page = response.xpath('//div[@class="turnpage"]/a[1]/@title').extract_first()
29         #专辑页码是倒着的,判断当前是否循环到第一页
30         if back_page != '上一页(无)':
31             try:
32                 back_url =self.start_urls[0]+response.xpath('//div[@class="turnpage"]/a[1]/@href').extract_first()
33 
34                 li_list = response.xpath('/html/body//div[@class="mntype_contentbg mntype_listall"]//li')
35                 for li in li_list:
36                     url_title=self.start_urls[0]+li.xpath('./a[1]/@href').extract_first()
37 
38                     title=li.xpath('./a[1]/@title').extract_first()
39                     #对专辑连接发送请求,获取图片信息
40                     yield scrapy.Request(url_title, callback=self.parse_img, meta={'item': item,'title':title,'url_title':url_title})
41 
42 
43                 yield scrapy.Request(back_url, callback=self.parse_second, meta={'item': item})
44             except:
45                 pass
46 
47     #解析专辑内的图片链接信息
48     def parse_img(self,response):
49         item=response.meta["item"]
50         item["title"]=response.meta['title']
51         item['title_url']=response.meta['url_title']
52         # print(item['title'],response.meta['url_title'])
53         item['urls']=[]
54         li_lis=response.xpath('//div[@class="small"]/ul/li')
55         for i,li in enumerate(li_lis):
56             # print(i)
57             if i== 0 or i==(len(li_lis)-1):
58                 continue
59             src=li.xpath('./img/@datas').extract_first().split('\'')[-2]
60             item['urls'].append(src)
61 
62         yield item
爬虫文件setuw.py

相关文章: