Spider
这是一个基础的爬虫类,其他所有的爬虫类都继承于这个 Spider 类:
class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this class.

    A spider needs a ``name`` (given either as a class attribute or as a
    constructor argument) and usually a ``start_urls`` list; the initial
    requests are produced by :meth:`start_requests`.
    """

    # Spider name; must end up non-empty, otherwise __init__ raises ValueError.
    name = None
    # Not read anywhere in this class; defaults to None.
    custom_settings = None

    def __init__(self, name=None, **kwargs):
        """Initialise the spider.

        Args:
            name: Optional spider name; overrides the class-level ``name``.
            **kwargs: Copied verbatim onto the instance as attributes.

        Raises:
            ValueError: If no name was passed and the class defines none.
        """
        if name is not None:
            self.name = name
        elif not getattr(self, 'name', None):
            raise ValueError("%s must have a name" % type(self).__name__)
        # Arbitrary keyword arguments become instance attributes.
        self.__dict__.update(kwargs)
        # Guarantee that start_requests() always has something to iterate.
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    def start_requests(self):
        """Yield the initial requests, one per entry in ``start_urls``.

        Raises:
            AttributeError: If ``start_urls`` is empty but a ``start_url``
                attribute exists (most likely a typo in the spider).
        """
        cls = self.__class__
        if not self.start_urls and hasattr(self, 'start_url'):
            raise AttributeError(
                "Crawling could not start: 'start_urls' not found "
                "or empty (but found 'start_url' attribute instead, "
                "did you miss an 's'?)")
        if method_is_overridden(cls, Spider, 'make_requests_from_url'):
            # Backward compatibility: a subclass still overrides the
            # deprecated hook, so warn and keep routing requests through it.
            warnings.warn(
                "Spider.make_requests_from_url method is deprecated; it "
                "won't be called in future Scrapy releases. Please "
                "override Spider.start_requests method instead (see %s.%s)." % (
                    cls.__module__, cls.__name__
                ),
            )
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
        else:
            for url in self.start_urls:
                yield Request(url, dont_filter=True)

    def make_requests_from_url(self, url):
        """ This method is deprecated. """
        return Request(url, dont_filter=True)

    def parse(self, response):
        """Handle a response; this base implementation always raises.

        Raises:
            NotImplementedError: Always, naming the concrete spider class.
        """
        raise NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__))
在 __init__ 方法中会设定 name 和 start_urls 两个属性(name 不能为空,否则抛出 ValueError;start_urls 未定义时默认为空列表)。当调用了 start_requests 方法后,才会开始执行爬取动作:为 start_urls 中的每个 URL 生成一个爬取请求(如果子类重写了已废弃的 make_requests_from_url 方法,则通过该方法生成请求)。
了解到了Spider类中爬取请求的产生过程,我们就可以根据自己的需求来重写start_requests 方法。
而如果我们忘记重写 parse 方法,调用它时就会抛出一个 NotImplementedError 异常。
CrawlSpider
class CrawlSpider(Spider):
    """Spider whose ``parse`` callback is reserved for its own internal use.

    Only an excerpt of the class is shown here: ``_compile_rules`` and
    ``_parse_response`` are called below but defined elsewhere.
    """

    # Empty by default; presumably consumed by _compile_rules() -- TODO confirm.
    rules = ()

    def __init__(self, *a, **kw):
        """Run the base ``Spider`` initialisation, then ``_compile_rules()``."""
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        """Pass ``response`` to ``_parse_response`` with link following enabled.

        ``parse_start_url`` is supplied as the callback, so subclasses should
        customise that hook rather than override ``parse`` itself.
        """
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        """Overridable hook used as the callback in ``parse``.

        The default implementation returns an empty list.
        """
        return []
由于 parse 方法已在 CrawlSpider 内部使用,因此不能重写;
但是 parse_start_url 这个方法是可以重写的,用以分析传入的 response,返回一个 item 或者 request,或者同时包含二者的可迭代对象。