【发布时间】:2016-08-04 16:22:24
【问题描述】:
我的蜘蛛运行没有显示任何错误,但图像没有存储在文件夹中这是我的scrapy文件:
Spider.py:
import scrapy
import re
import os
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem, ListResidentialItem
class productionSpider(scrapy.Spider):
name = "production"
allowed_domains = ["someurl.com"]
start_urls = [
"someurl.com"
]
def parse(self, response):
for sel in response.xpath('//html/body'):
item = ProductionItem()
img_url = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract()[0]
yield scrapy.Request(urlparse.urljoin(response.url, img_url),callback=self.parseBasicListingInfo, meta={'item': item})
def parseBasicListingInfo(item, response):
item = response.request.meta['item']
item = ListResidentialItem()
try:
image_urls = map(unicode.strip,response.xpath('//a[@itemprop="contentUrl"]/@data-href').extract())
item['image_urls'] = [ x for x in image_urls]
except IndexError:
item['image_urls'] = ''
return item
settings.py:
from scrapy.settings.default_settings import ITEM_PIPELINES
from scrapy.pipelines.images import ImagesPipeline
BOT_NAME = 'production'
SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'
ROBOTSTXT_OBEY = True
DEPTH_PRIORITY = 1
IMAGE_STORE = '/images'
CONCURRENT_REQUESTS = 250
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {
'scrapy.contrib.pipeline.images.ImagesPipeline': 300,
}
items.py
# -*- coding: utf-8 -*-
import scrapy
class ProductionItem(scrapy.Item):
img_url = scrapy.Field()
# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
image_urls = scrapy.Field()
images = scrapy.Field()
pass
我的管道文件是空的,我不确定要添加到 pipeline.py 文件中的内容。
非常感谢任何帮助。
【问题讨论】: