1、首先创建项目
进入相对应目录,输入scrapy startproject img
2、创建爬虫文件
cd img 输入 scrapy genspider -t basic qiantu 5442.com
3、进入 items 文件,创建保存 url 地址的容器
import scrapy


class ImgItem(scrapy.Item):
    """Scrapy item describing one scraped image."""

    # Address of the image, filled in by the spider.
    url = scrapy.Field()
4、分析网站,编写爬虫代码
首页爬取的第一层链接
def parse(self, response):
    """Follow every first-level link found in the home page navigation.

    Extracts each ``href`` below the ``nav both`` div, prints the extracted
    list, and yields one Request per link with ``self.next`` as callback.
    """
    category_links = response.xpath("//div[@class='nav both']//a/@href").extract()
    print(category_links)
    for link in category_links:
        yield Request(url=link, callback=self.next)
提取第二层链接
def next(self, response):
    """Yield the paginated list pages of a known category.

    When ``response.url`` is one of the category landing pages listed
    below, requests ``list_<id>_<page>.html`` for pages 1 through 249 and
    hands each response to ``self.next1``. Any other URL yields nothing.
    """
    # Category landing page -> id fragment used in its list-page file names.
    list_prefixes = {
        "http://www.5442.com/mingxing/": "2_",
        "http://www.5442.com/qiche/": "3_",
        "http://www.5442.com/fengjing/": "4_",
        "http://www.5442.com/youxi/": "5_",
        "http://www.5442.com/katong/": "6_",
        "http://www.5442.com/tushuo/": "8_",
        "http://www.5442.com/mingxingtuku/": "9_",
    }
    base_url = response.url
    prefix = list_prefixes.get(base_url)
    if prefix is None:
        return
    for page in range(1, 250):
        page_url = base_url + "list_" + prefix + str(page) + ".html"
        yield Request(url=page_url, callback=self.next1)


def next1(self, response):
    """Follow each link inside the list page's ``w650 l`` container.

    Every extracted ``href`` is requested with ``self.next2`` as callback.
    """
    detail_links = response.xpath("//div[@class='w650 l']//li//a/@href").extract()
    for link in detail_links:
        yield Request(url=link, callback=self.next2)
http://www.5442.com/mingxing/list_2_2.html
由于此网站没有显示每个分类的总页数,这里假设每个分类最多约 250 页(代码中 range(1, 250) 实际生成第 1~249 页),并据此分析下一页链接的构成(next 方法);next1 方法负责爬取图片所在页面的链接。
爬取第三层链接(图片的最终地址)
def next2(self, response):
    """Yield one ImgItem for every image found in the ``arcBody`` div.

    Each ``src`` attribute extracted from the page is stored under the
    item's ``url`` key. A ValueError raised while producing items ends
    the extraction for this page without being reported.
    """
    image_sources = response.xpath("//div[@class='arcBody']//a/img/@src").extract()
    try:
        for src in image_sources:
            entry = ImgItem()
            entry['url'] = src
            yield entry
    except ValueError:
        # Deliberately ignored: the remaining images of this page are skipped.
        pass
5、写pipelines文件
class ImgPipeline(object):
    """Item pipeline that saves the image behind each item's ``url`` to disk.

    Downloading is best-effort: a failure is logged through the spider's
    logger and the item is still passed on unchanged.
    """

    # Directory (including the trailing slash) that images are written to.
    # Kept as a class attribute so a subclass or a test can point elsewhere.
    image_dir = "F:/peitao/image/"

    def process_item(self, item, spider):
        """Download ``item["url"]`` into ``image_dir`` and return the item.

        Args:
            item: Mapping-like item carrying the image address under "url".
            spider: The running spider; its ``logger`` reports failures.

        Returns:
            The same ``item`` object, whether or not the download succeeded.
        """
        # Function-scope import so this block does not depend on the
        # file's import section.
        import uuid

        try:
            image_url = item["url"]
            # A uuid4 name is effectively collision-free. The previous
            # random number below 100000 repeats after a few hundred
            # images and silently overwrote earlier downloads.
            target = self.image_dir + uuid.uuid4().hex + ".jpg"
            urllib.request.urlretrieve(image_url, filename=target)
        except Exception as exc:
            # Still best-effort, but no longer silent: record what failed.
            spider.logger.warning("Failed to save image for item %r: %s", item, exc)
        return item


# Remember to uncomment ITEM_PIPELINES in the settings file so that this
# pipeline is enabled:
# Enables ImgPipeline from the img project's pipelines module. The integer is
# the pipeline's order value (per the Scrapy docs, lower values run earlier).
ITEM_PIPELINES = {
'img.pipelines.ImgPipeline': 300,
}