【发布时间】:2017-08-25 12:49:24
【问题描述】:
我正在尝试构建这个爬虫来从 craigslist 获取住房数据,
但是爬虫在抓取到第一页后就停止了,没有转到下一页。
这是代码,它适用于第一页,但看在上帝的份上,我不明白为什么它不能进入下一页。任何见解都非常感谢。我关注了this part from scrapy tutorial
import scrapy
import re
from scrapy.linkextractors import LinkExtractor
class QuotesSpider(scrapy.Spider):
name = "craigslistmm"
start_urls = [
"https://vancouver.craigslist.ca/search/hhh"
]
def parse_second(self,response):
#need all the info in a dict
meta_dict = response.meta
for q in response.css("section.page-container"):
meta_dict["post_details"]= {
"location":
{"longitude":q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-longitude)" ).extract(),
"latitude":q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-latitude)" ).extract()},
"detailed_info": ' '.join(q.css('section#postingbody::text').extract()).strip()
}
return meta_dict
def parse(self, response):
pattern = re.compile("\/([a-z]+)\/([a-z]+)\/.+")
for q in response.css("li.result-row"):
post_urls = q.css("p.result-info a::attr(href)").extract_first()
mm = re.match(pattern, post_urls)
neighborhood= q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
next_url = "https://vancouver.craigslist.ca/"+ post_urls
request = scrapy.Request(next_url,callback=self.parse_second)
#next_page = response.xpath('.//a[@class="button next"]/@href').extract_first()
#follow_url = "https://vancouver.craigslist.ca/" + next_page
#request1 = scrapy.Request(follow_url,callback=self.parse)
#yield response.follow(next_page,callback = self.parse)
request.meta['id'] = q.css("li.result-row::attr(data-pid)").extract_first()
request.meta['pricevaluation'] = q.css("p.result-info span.result-meta span.result-price::text").extract_first()
request.meta["information"] = q.css("p.result-info span.result-meta span.housing::text" ).extract_first()
request.meta["neighborhood"] =q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
request.meta["area"] = mm.group(1)
request.meta["adtype"] = mm.group(2)
yield request
#yield scrapy.Request(follow_url, callback=self.parse)
next_page = LinkExtractor(allow="s=\d+").extract_links(response)[0]
# = "https://vancouver.craigslist.ca/" + next_page
yield response.follow(next_page.url,callback=self.parse)
【问题讨论】:
标签: python-2.7 scrapy