您可以在脚本中修改传给爬虫进程(CrawlerProcess)的设置,并在每次调用 crawl 之前指定不同的管道:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re
class ExampleSpider(scrapy.Spider):
    """Spider that scrapes the title and price from a single detail page."""

    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        """Yield one ``ExampleItem`` built from the response.

        Both fields are taken from the first matching text node; ``.get()``
        is called without a default, so a field may be ``None``.
        """
        title_text = response.xpath('//h3/text()').get()
        price_text = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield ExampleItem(title=title_text, price=price_text)
class ExampleItem(scrapy.Item):
    """Container for the two values extracted by ``ExampleSpider.parse``."""

    # Text of the page's <h3> element.
    title = scrapy.Field()
    # Raw text of the <h4> inside the "card-body" div; the pipelines parse it.
    price = scrapy.Field()
class ItemPipeline1:
    """Item pipeline that reports items priced below 15.

    Items whose price is missing or cannot be parsed are dropped; every
    other item is passed on unchanged.
    """

    def process_item(self, item, spider):
        """Validate the item's price and report cheap items.

        Args:
            item: Scraped item whose ``price`` field holds text containing
                a decimal number.
            spider: Spider that produced the item (unused).

        Returns:
            The unmodified item.

        Raises:
            DropItem: If the price is missing or empty, or if it contains
                no decimal number.
        """
        adapter = ItemAdapter(item)
        # .get() so an absent field is reported as "missing" rather than
        # surfacing as a KeyError.
        price_text = adapter.get('price')
        if not price_text:
            raise DropItem(f"Missing price in {item}")

        # re.search yields the same first match as findall(...)[0], but lets
        # us handle "no match" explicitly instead of raising IndexError.
        match = re.search(r'\d+\.\d+', price_text)
        if match is None:
            raise DropItem(f"Unparseable price in {item}")

        if float(match.group()) < 15:
            print('Cheap enough')
        # A pipeline must hand the item on (or raise DropItem); never fall
        # through and implicitly return None.
        return item
class ItemPipeline2:
    """Item pipeline that reports items priced above 10.

    Items whose price is missing or cannot be parsed are dropped; every
    other item is passed on unchanged.
    """

    def process_item(self, item, spider):
        """Validate the item's price and report expensive items.

        Args:
            item: Scraped item whose ``price`` field holds text containing
                a decimal number.
            spider: Spider that produced the item (unused).

        Returns:
            The unmodified item.

        Raises:
            DropItem: If the price is missing or empty, or if it contains
                no decimal number.
        """
        adapter = ItemAdapter(item)
        # .get() so an absent field is reported as "missing" rather than
        # surfacing as a KeyError.
        price_text = adapter.get('price')
        if not price_text:
            raise DropItem(f"Missing price in {item}")

        # re.search yields the same first match as findall(...)[0], but lets
        # us handle "no match" explicitly instead of raising IndexError.
        match = re.search(r'\d+\.\d+', price_text)
        if match is None:
            raise DropItem(f"Unparseable price in {item}")

        if float(match.group()) > 10:
            print('Too expensive')
        # A pipeline must hand the item on (or raise DropItem); never fall
        # through and implicitly return None.
        return item
if __name__ == "__main__":
    # Schedule the same spider twice in one process, once per pipeline.
    spidername = 'exampleSpider'
    pipeline_paths = (
        'tempbuffer.spiders.yetanotherspider.ItemPipeline1',
        'tempbuffer.spiders.yetanotherspider.ItemPipeline2',
    )

    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)

    # ITEM_PIPELINES is reassigned immediately before each crawl() call.
    # NOTE(review): this relies on each crawl using the settings as they are
    # at the moment crawl() is called - confirm for the Scrapy version in use.
    for pipeline_path in pipeline_paths:
        settings['ITEM_PIPELINES'] = {pipeline_path: 300}
        process.crawl(spidername)

    # Blocks until both scheduled crawls have finished.
    process.start()
但如果您确实想在爬虫(Spider)类内部完成这一切,可以重写 `update_settings` 类方法:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re
class ExampleSpider(scrapy.Spider):
    """Spider that picks its item pipeline from a class-level flag.

    ``update_settings`` applies ``custom_settings1`` when the class attribute
    ``is_pipeline_1`` is truthy or unset, and ``custom_settings2`` otherwise.
    """

    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    custom_settings1 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}}
    custom_settings2 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}}

    @classmethod
    def update_settings(cls, settings):
        """Merge the selected settings dict into ``settings`` at spider priority."""
        # The flag defaults to True, so pipeline 1 is used unless the caller
        # sets ``is_pipeline_1`` to a falsy value on the class.
        if getattr(cls, 'is_pipeline_1', True):
            attr_name = 'custom_settings1'
        else:
            attr_name = 'custom_settings2'

        # A missing or None attribute falls back to an empty dict.
        selected = getattr(cls, attr_name, None)
        settings.setdict(selected or {}, priority='spider')

    def parse(self, response):
        """Yield one ``ExampleItem`` holding the page's title and price text."""
        title_text = response.xpath('//h3/text()').get()
        price_text = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield ExampleItem(title=title_text, price=price_text)
class ExampleItem(scrapy.Item):
    """Container for the two values extracted by ``ExampleSpider.parse``."""

    # Text of the page's <h3> element.
    title = scrapy.Field()
    # Raw text of the <h4> inside the "card-body" div; the pipelines parse it.
    price = scrapy.Field()
class ItemPipeline1:
    """Item pipeline that reports items priced below 15.

    Items whose price is missing or cannot be parsed are dropped; every
    other item is passed on unchanged.
    """

    def process_item(self, item, spider):
        """Validate the item's price and report cheap items.

        Args:
            item: Scraped item whose ``price`` field holds text containing
                a decimal number.
            spider: Spider that produced the item (unused).

        Returns:
            The unmodified item.

        Raises:
            DropItem: If the price is missing or empty, or if it contains
                no decimal number.
        """
        adapter = ItemAdapter(item)
        # .get() so an absent field is reported as "missing" rather than
        # surfacing as a KeyError.
        price_text = adapter.get('price')
        if not price_text:
            raise DropItem(f"Missing price in {item}")

        # re.search yields the same first match as findall(...)[0], but lets
        # us handle "no match" explicitly instead of raising IndexError.
        match = re.search(r'\d+\.\d+', price_text)
        if match is None:
            raise DropItem(f"Unparseable price in {item}")

        if float(match.group()) < 15:
            print('Cheap enough')
        # A pipeline must hand the item on (or raise DropItem); never fall
        # through and implicitly return None.
        return item
class ItemPipeline2:
    """Item pipeline that reports items priced above 10.

    Items whose price is missing or cannot be parsed are dropped; every
    other item is passed on unchanged.
    """

    def process_item(self, item, spider):
        """Validate the item's price and report expensive items.

        Args:
            item: Scraped item whose ``price`` field holds text containing
                a decimal number.
            spider: Spider that produced the item (unused).

        Returns:
            The unmodified item.

        Raises:
            DropItem: If the price is missing or empty, or if it contains
                no decimal number.
        """
        adapter = ItemAdapter(item)
        # .get() so an absent field is reported as "missing" rather than
        # surfacing as a KeyError.
        price_text = adapter.get('price')
        if not price_text:
            raise DropItem(f"Missing price in {item}")

        # re.search yields the same first match as findall(...)[0], but lets
        # us handle "no match" explicitly instead of raising IndexError.
        match = re.search(r'\d+\.\d+', price_text)
        if match is None:
            raise DropItem(f"Unparseable price in {item}")

        if float(match.group()) > 10:
            print('Too expensive')
        # A pipeline must hand the item on (or raise DropItem); never fall
        # through and implicitly return None.
        return item
if __name__ == "__main__":
    # NOTE: this name is assigned but never used below; the spider class is
    # passed to crawl() directly.
    spidername = 'exampleSpider'

    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)

    # Schedule two crawls of the same spider class, flipping the class-level
    # flag that ExampleSpider.update_settings reads before each one.
    for use_first_pipeline in (True, False):
        ExampleSpider.is_pipeline_1 = use_first_pipeline
        process.crawl(ExampleSpider)

    # Blocks until both scheduled crawls have finished.
    process.start()
不过说实话,我认为第一种方式更好……