Wanted to scrape some data in my spare time, so I fell into the Scrapy rabbit hole, working through the tutorial at https://zhuanlan.zhihu.com/data-factory. A handful of sites tripped me up along the way, but I still got my spiders written~~
切糕王子: no anti-scraping defenses at all; you can just scan it directly;
尚妆网 (Showjoy): only when I opened this site did I realize it wasn't what the tutorial promised!!! It also loads items as you scroll to the bottom, except you apparently can't drag all the way down; you have to move back up a bit before the next batch loads? Still, the tutorial's tip of watching the requested URLs, working out the pattern, and then forging the parameters held up, as the sketch below shows.
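To sanity-check the pattern before writing the spider, you can request a couple of pages by hand. A minimal sketch (Python 2, same style as the spider below; it assumes the search endpoint still answers the way it did when I crawled it):

# -*- coding: utf-8 -*-
# quick probe: does '&page=N' really page through the search results?
import re
import urllib2

base = 'http://list.m.showjoy.com/search/?q=cateIds%3A1,cateName%3A%E5%A5%97%E8%A3%85&stock=1'
headers = {'User-Agent': 'Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0'}

for page in (1, 2, 3):
    req = urllib2.Request(base + '&page=%d' % page, headers=headers)
    body = urllib2.urlopen(req).read()
    links = re.findall(r'http://item\.m\.showjoy\.com/sku/[0-9]+\.html', body)
    # a fresh batch of sku links on each page confirms the pattern
    print 'page %d: %d links' % (page, len(links))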
Code for batch-crawling the product URLs (only the ShowjoySpider part):
# -*- coding: utf-8 -*-
import time
import sys
import re

reload(sys)
sys.setdefaultencoding("utf-8")

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector  # only needed for the commented-out token extraction
from tutorial.items import TutorialItem        # unused in this spider; part of the tutorial project
from scrapy.http.request import Request


class ShowjoySpider(BaseSpider):
    name = "Showjoy"
    allowed_domains = ["showjoy.com"]
    start_urls = [
        "http://list.m.showjoy.com/search/?q=cateIds%3A1,cateName%3A%E5%A5%97%E8%A3%85&stock=1"
    ]

    # to keep the login status
    cookies = {}

    # pretend to visit the page from a smartphone (or a desktop browser)
    headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0'
        # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }

    # let redirects through instead of treating 301/302 as errors
    meta = {
        'dont_redirect': False,
        'handle_httpstatus_list': [301, 302]
    }

    def toFile(self, text, fileName):
        # append one line to the given file
        with open(fileName, "a") as f:
            f.write(text)
            f.write('\n')

    def start_requests(self):
        """Override so the first request carries our headers/cookies/meta."""
        yield Request(self.start_urls[0], callback=self.parse,
                      headers=self.headers, cookies=self.cookies, meta=self.meta)

    def parse(self, response):
        print 'analyse starting'
        body = response.body
        # pull every product (sku) link out of the raw page
        linklist = re.findall(r'http://item\.m\.showjoy\.com/sku/[0-9]+\.html', body)
        # token = sele.select('//input[@name="_synToken"]/@value').extract()[0]
        print 'len=' + str(len(linklist))
        if len(linklist) == 0:
            # an empty page means we have walked past the last one
            return
        # print 'token=' + str(token)
        self.toFile(str(linklist), "urlList.txt")

        # build the next page's url: no page parameter yet means we just
        # fetched page 1, otherwise bump the current page number by one
        newurl = self.start_urls[0] + '&page='
        exresult = re.search(r'page=(\d+)', response.url)
        if exresult is None:
            print 'page 2'
            # newurl += '2&_synToken=' + str(token)
            newurl += '2'
        else:
            print 'page n'
            newpagenum = int(exresult.group(1)) + 1
            # newurl += str(newpagenum) + '&_synToken=' + str(token)
            newurl += str(newpagenum)
        time.sleep(1)  # be polite: one page per second
        yield Request(newurl, callback=self.parse, headers=self.headers,
                      cookies=self.cookies, meta=self.meta)
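Assuming the standard Scrapy project layout from the tutorial (the project is named tutorial, as the TutorialItem import suggests), the spider is started with scrapy crawl Showjoy. Each pass appends one page's sku links to urlList.txt and schedules the next page; the crawl stops itself once a page yields no links. Note this is Python 2-era Scrapy: reload(sys)/sys.setdefaultencoding and the print statements won't run under Python 3, and BaseSpider has since been renamed scrapy.Spider.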