【发布时间】:2015-07-13 04:24:11
【问题描述】:
首先这是我的代码-:
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess, CrawlerRunner
import scrapy
#from scrapy import log, signals
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
import datetime
from multiprocessing import Process, Queue
import os
from scrapy.http import Request
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
import re
#query=raw_input("Enter a product to search for= ")
query='apple'
query1=query.replace(" ", "+")
class DmozItem(scrapy.Item):
productname = scrapy.Field()
product_link = scrapy.Field()
current_price = scrapy.Field()
mrp = scrapy.Field()
offer = scrapy.Field()
imageurl = scrapy.Field()
outofstock_status = scrapy.Field()
add = scrapy.Field()
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["http://www.bestmercato.com"]
def start_requests(self):
task_urls = [
]
i=1
for i in range(1,2):
temp=("https://www.bestmercato.com/index.php?route=product/search&search="+query1+"&page="+str(i))
task_urls.append(temp)
i=i+1
start_urls = (task_urls)
# p=len(task_urls)
return [ Request(url = start_url) for start_url in start_urls ]
def parse(self, response):
items = []
for sel in response.xpath('//html/body/div/div/div[4]/div/div/div[5]/div'):
item = DmozItem()
item['productname'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/text()').extract())[3:-2]
item['product_link'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/@href').extract())[3:-2]
point1 = sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]').extract()
point = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/@class').extract())[3:-2]
checker = "options" in point
item['current_price'] = ""
if checker:
i=1
p=1
while i==1:
t = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/div/select/option['+str(p)+']/text()').extract())[3:-2]
#print t
if 'Rs' not in t:
i = 2
elif 'Rs' in t:
i = 1
t= " ".join(t)
s = t.translate(None, '\ t')[:-2]
item['current_price'] = item['current_price'] + ' ; ' + s
p = p+1
item['current_price'] = item['current_price'][3:-3]
else:
item['current_price'] = 'Rs. ' + str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[not (@class="name") or not(@class="description") or not(@class="qty") or not(@class="box_btn_icon")]/text()').extract())[46:-169]
re.findall(r"[-+]?\d*\.\d+|\d+", item["current_price"])
try:
test1 = str(sel.xpath('div/div[2]/div[3]/span[1]/text()').extract())[3:-2]
_digits = re.compile('\d')
if bool(_digits.search(test1)):
print 'hi'
test1=test1[:2]+'. '+test1[3:]
item['mrp'] = test1
#item['mrp'][2:2]='.'
test2 = str(sel.xpath('div/div[2]/div[3]/span[2]/text()').extract())[3:-2]
test2=test2[:2]+'. '+test2[3:]
item['current_price']=test2
else:
item['mrp'] = item['current_price']
except:
item['mrp'] = item['current_price']
item['offer'] = 'No additional offer available'
item['imageurl'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="image"]/a[not (@class="sft_quickshop_icon")]/img[@class="img-responsive"]/@src').extract())[3:-2]
item['outofstock_status'] = str('In Stock')
request = Request(str(item['product_link']),callback=self.parse2, dont_filter=True)
request.meta['item'] = item
# print item
items.append(item)
return request
print (items)
def parse2(self, response):
item = response.meta['item']
item['add'] = response.url
return item
spider1 = DmozSpider()
settings = Settings()
settings.set("PROJECT", "dmoz")
settings.set("CONCURRENT_REQUESTS" , 100)
#)
#settings.set( "DEPTH_PRIORITY" , 1)
#settings.set("SCHEDULER_DISK_QUEUE" , "scrapy.squeues.PickleFifoDiskQueue")
#settings.set( "SCHEDULER_MEMORY_QUEUE" , "scrapy.squeues.FifoMemoryQueue")
crawler = CrawlerProcess(settings)
crawler.crawl(spider1)
crawler.start()
现在,这些是我面临的问题。
1. 可以使用此 xpath 找到许多 div - '//html/body/div/div/div[4]/div/div/div[5]/div ' .但是,上面的代码仅抓取第一个 div 的内容,即具有 xpath 'html/body/div/div/div[4]/div/div/div[5]/div[1]' ,而不是全部。
在我评论这三行的那一刻,刮板刮掉了所有东西,显然我无法在项目中添加“添加”字段-:
request = Request(str(item['product_link']),callback=self.parse2, dont_filter=True)
request.meta['item'] = item
return request
所以,除了我的项目类中的“添加”字段(注意 DmozItem 类)之外,我还想抓取所有 div。我怎么做?请为我的具体案例提供更正的代码,这样最好!
2.其次,正如我所说,当我评论上面提到的三行代码时,程序会在接近 5 秒(大约 4.9 秒)的时间内抓取所有内容。 p>
但是一旦我取消注释,那 3 行(同样是我上面提到的那些),程序的运行时间就会大大超过,它的运行时间接近 9 秒(大约 8.8 - 8.9 秒)。为什么会这样?是不是因为这个 - dont_filter=True?请提出解决这个问题的方法,因为运行时对我来说是一个非常大的问题。另外,我可以以某种方式减少 5 秒(大约 4.9 秒)的初始时间吗?
【问题讨论】:
标签: python callback web-scraping scrapy scrapy-spider