【发布时间】:2015-11-09 01:25:10
【问题描述】:
一切正常,但每次点击时我都无法提取新标签的网址?我错过了什么我已经尝试了所有可能的资源 - current_url,response.url,getCurrentUrl 似乎都不起作用。在这种情况下,最好的解决方案是什么?
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time
from selenium.webdriver.common.keys import Keys
class CompItem(scrapy.Item):
model_name = scrapy.Field()
model_link = scrapy.Field()
url =scrapy.Field()
class criticspider(CrawlSpider):
name = "paytm_l"
allowed_domains = ["paytm.com"]
start_urls = ["https://paytm.com/shop/g/electronics/mobile-accessories/mobiles"]
def __init__(self, *args, **kwargs):
super(criticspider, self).__init__(*args, **kwargs)
self.download_delay = 0.25
self.browser = webdriver.Firefox()
self.browser.implicitly_wait(2)
def parse_start_url(self, response):
self.browser.get(response.url)
#sites = response.xpath('//div[@class="single-review"]/div[@class="review-header"]')
self.browser.implicitly_wait(30)
items = []
time.sleep(20)
# for i in range(0,200):
#self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sel = Selector(text=self.browser.page_source)
sites = sel.xpath('//div[contains(@class,"overflow-hidden")]')
item = CompItem()
for r in range(1,5):
#item['model_name'] = site.xpath('.//p[contains(@ng-if,"applyLimit")]/text()')
button = self.browser.find_element_by_xpath("/html/body/div[5]/div[5]/div/div[5]/div[3]/ul/li[%d]/a"%r)
main_window = self.browser.current_window_handle
button.send_keys(Keys.CONTROL + Keys.RETURN)
self.browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.TAB)
time.sleep(5)
self.browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + 'l')
self.browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + 'c')
item["url"]=self.browser.current_url()
time.sleep(10)
self.browser.switch_to_window(main_window)
time.sleep(10)
self.browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + 'w')
self.browser.switch_to_window(main_window)
#item['model_link'] = site.xpath('//a[contains(@class,"{"na": !productClasses(product)}"]/@href').extract()[0]
yield item
【问题讨论】:
标签: python selenium xpath selenium-webdriver scrapy