【发布时间】:2019-03-13 18:31:41
【问题描述】:
我在使用 python 获取下一页链接时遇到问题。
代码
import scrapy
from scrapy.http import Request
from gharbheti.items import GharbhetiItem
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Identity, MapCompose, Join, Compose
from urllib.parse import urljoin
class ListSpider(scrapy.Spider):
    """Spider that walks the gharbheti.com sale/rent listings.

    ``parse`` handles a listing page and schedules one request per property
    link; ``parse_property`` turns a property detail page into a populated
    ``GharbhetiItem``.
    """

    name = 'list'
    allowed_domains = ['gharbheti.com']
    start_urls = ['https://www.gharbheti.com/sale', 'https://www.gharbheti.com/rent']

    def parse(self, response):
        """Yield a ``Request`` for every property link on a listing page.

        Each request is routed to ``parse_property`` and carries the absolute
        property URL in ``meta['URL']``.
        """
        # Select the href attributes directly: anchors without an href are
        # skipped instead of producing None, which urljoin() would resolve
        # back to the listing page itself.
        hrefs = response.xpath(
            '//li[@class="col-md-6 Search_building"]/descendant::a/@href'
        ).extract()
        for href in hrefs:
            url = response.urljoin(href)
            yield Request(url, callback=self.parse_property, meta={'URL': url})

    def parse_property(self, response):
        """Build and return a ``GharbhetiItem`` from a property detail page."""
        l = ItemLoader(item=GharbhetiItem(), response=response)

        l.add_value('URL', response.url)
        l.add_xpath(
            'Title',
            '//div[@class="product-page-meta"]/h4/em/text()',
            MapCompose(str.strip, str.title),
        )
        # The figcaption reads "For Sale" / "For Rent"; drop the "For" prefix.
        l.add_xpath(
            'Offering',
            '//figcaption[contains(text(), "For Sale")]/text()|//figcaption[contains(text(),"For Rent")]/text()',
            MapCompose(lambda i: i.replace('For', ''), str.strip),
        )
        l.add_xpath(
            'Price',
            '//div[@class="deal-pricebox"]/descendant::h3/text()',
            MapCompose(str.strip),
        )
        l.add_xpath(
            'Type',
            '//ul[@class="suitable-for"]/li/text()',
            MapCompose(str.strip),
        )

        # Both parking values are merged into one descriptive string; a value
        # that is missing on the page is rendered as "None" by format().
        bike_parking = response.xpath(
            '//i[@class="fa fa-motorcycle"]/following-sibling::em/text()'
        ).extract_first()
        car_parking = response.xpath(
            '//i[@class="fa fa-car"]/following-sibling::em/text()'
        ).extract_first()
        parking = "Bike Parking: {} Car Parking: {}".format(bike_parking, car_parking)
        l.add_value('Parking', parking)

        l.add_xpath(
            'Description',
            '//div[@class="comment more"]/text()',
            MapCompose(str.strip),
        )

        # Room counts: strip the label text next to each icon, then convert
        # the remainder to int.
        l.add_xpath(
            'Bedroom',
            '//i[@class="fa fa-bed"]/following-sibling::text()',
            MapCompose(lambda i: i.replace('Total Bed Room:', ''), str.strip, int),
        )
        l.add_xpath(
            'Livingroom',
            '//i[@class="fa fa-inbox"]/following-sibling::text()',
            MapCompose(lambda i: i.replace('Total Living Room:', ''), str.strip, int),
        )
        l.add_xpath(
            'Kitchen',
            '//i[@class="fa fa-cutlery"]/following-sibling::text()',
            MapCompose(lambda i: i.replace('Total kitchen Room:', ''), str.strip, int),
        )
        l.add_xpath(
            'Bathroom',
            '//i[@class="fa fa-puzzle-piece"]/following-sibling::text()',
            MapCompose(lambda i: i.replace('Total Toilet/Bathroom:', ''), str.strip, int),
        )

        # 'Map Loaction' (sic) is the literal label being removed here.
        l.add_xpath(
            'Address',
            '//b[contains(text(), "Map")]/text()',
            MapCompose(lambda i: i.replace('Map Loaction :-', ''), str.strip),
        )
        l.add_xpath(
            'Features',
            '//div[@class="list main-list"]/ul/li/text()',
            MapCompose(str.strip),
        )

        # Read the src attribute itself rather than string-editing the
        # serialized <img> tag, so trailing attributes can never leak into
        # the URL; the query string is dropped and the site prefix added.
        image_sources = response.xpath(
            '//div[@class="carousel-inner dtl-carousel-inner text-center"]'
            '/descendant::img/@src'
        ).extract()
        image_urls = [
            "http://www.gharbheti.com" + src.split('?')[0]
            for src in image_sources
        ]
        l.add_value('Images', image_urls)

        return l.load_item()
我无法从该网站检索到下一页。对于另一个站点,我是这样做的(没有 javascript 的简单分页):
next_page=response.urljoin(response.xpath('//a[contains(text(), "Next")]/@href').extract_first()
yield Request(next_page, callback=self.parse)
【问题讨论】:
-
欢迎来到 StackOverflow。您的问题目前不太符合 StackOverflow 期望的标准,以当前状态可能不会被接受。我强烈建议您按照这篇 StackOverflow 文章(this StackOverflow article)的准则编辑您的问题。
标签: javascript python pagination scrapy