【发布时间】:2015-10-12 20:27:44
【问题描述】:
我想用 scrapy 从 squawka.com 抓取一些数据,代码如下所示。首先我进入比赛结果页面,在这里搜索德甲联赛(Bundesliga)各场比赛的详细统计链接,同时抓取所有分页链接并发送请求。这部分一切正常,但在 “for Liga in Ligas” 循环中发出的请求,由于页面背后的动态内容而无法正常工作。链接加载后,我可以抓取球队名称、比分、日期和球场名称,但由于页面的动态行为,我无法抓取例如主队的射门数据(item['ShotsA'] = response.xpath("//div[@id='mc-stat-shot']/div/div[@class='team2-data']/text()").extract() 的抓取结果为空)。我用 Selenium 做了一些尝试,但没有成功。有人可以帮我用请求(Request)或 Selenium 解决这个问题吗?
import scrapy
from soccer.items import SoccerItem
from selenium import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
class SquawkaSpider(scrapy.Spider):
    """Crawl squawka.com match results and scrape Bundesliga match pages.

    ``parse`` walks the paginated results listing, follows the match-centre
    link of every row whose league is "Bundesliga", and follows the
    pagination arrows. ``parse_match_link`` turns one match page into a
    ``SoccerItem``.
    """

    name = "soccer"
    allowed_domains = ["squawka.com"]
    start_urls = ["http://www.squawka.com/match-results"]

    # Results listing that is requested once the first page has been seen.
    SEASON_RESULTS_URL = "http://www.squawka.com/match-results?ctl=22_s2014"

    # Cookies sent with the season listing request and every pagination
    # request (kept in one place instead of being repeated per request).
    SEASON_COOKIES = {
        'firsttime_new': '1',
        'sqhome_competition': '126',
        'sqhome_competitionidinfeed': '22',
        'sqhome_competitionteam': '0',
        'sqhome_seasonid': '2014',
        'timeZone': '2.0',
    }

    # Common prefix of the pagination widget lookups used by ``parse``.
    PAGINATION_XPATH = "//div/center/div[@id='sq-pagination']/span"

    # (item field, XPath) pairs evaluated against a match page.
    MATCH_FIELD_XPATHS = (
        ('TeamH', "//div[@id='team1']/ul[@class='team-lineup']/li[@id='team1-select']/text()"),
        ('GoalH', "//div[@id='sq-mc-top-header']/div[@id='mc-header-team-1']/div[@class='team-score'][1]/span/text()"),
        ('ShotsH', "//div[@id='mc-stat-shot']/div/div[@class='team1-data']/text()"),
        ('TeamA', "//div[@id='team2']/ul[@class='team-lineup']/li[@id='team2-select']/text()"),
        ('GoalA', "//div[@id='sq-mc-top-header']/div[@id='mc-header-team-2']/div[@class='team-score'][1]/span/text()"),
        ('ShotsA', "//div[@id='mc-stat-shot']/div/div[@class='team2-data']/text()"),
        ('Date', "//div[@id='sq-mc-top-header']/div[@id='mc-header-date']/text()"),
        ('Stadium', "//div[@id='sq-mc-top-header']/div[@id='mc-header-stadium']/text()"),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider, forwarding all arguments to ``scrapy.Spider``."""
        # The original body held only comments, which is not a valid method
        # body, and never ran the base-class initialiser.
        super(SquawkaSpider, self).__init__(*args, **kwargs)
        # NOTE: a Selenium webdriver (Chrome / Firefox) was experimented with
        # here; it is intentionally not created.

    def _season_request(self, url):
        """Build a listing request for ``url`` that is handled by ``parse``."""
        # Pass a copy so no request shares the class-level cookie dict.
        return scrapy.Request(
            url=url,
            cookies=dict(self.SEASON_COOKIES),
            callback=self.parse,
        )

    def parse(self, response):
        """Handle one results-listing page.

        Yields, in order: the season listing request (only when the current
        page number is "1"), one request per Bundesliga row that has a
        match-centre link, and a request for the next listing page when a
        pagination arrow link is found.
        """
        current = response.xpath(
            self.PAGINATION_XPATH + "/span[@class='page-numbers current']/text()"
        ).extract()
        # A page without the pagination widget used to raise IndexError here;
        # now its matches are still scraped and pagination is simply skipped.
        counter = current[0] if current else None
        on_first_page = counter == "1"

        if on_first_page:
            yield self._season_request(self.SEASON_RESULTS_URL)

        for match in response.xpath("//tr[@class='match-today']"):
            leagues = match.xpath("td[@class='match-league']/a/text()").extract()
            if "Bundesliga" not in leagues:
                continue
            links = match.xpath("td[@class='match-centre']/a/@href").extract()
            # Rows without a match-centre link are skipped instead of crashing.
            if links:
                yield scrapy.Request(links[0], callback=self.parse_match_link)

        if counter is None:
            return

        # Page "1" uses the first arrow link, every other page the third one
        # (presumably the "next" arrow in each layout -- confirm on the site).
        arrow_index = 1 if on_first_page else 3
        next_links = response.xpath(
            self.PAGINATION_XPATH
            + "/a[@class='pageing_text_arrow'][%d]/@href" % arrow_index
        ).extract()
        if next_links:
            yield self._season_request(next_links[0])

    def parse_match_link(self, response):
        """Scrape one match page into a ``SoccerItem``.

        Every field holds the list returned by ``extract()`` for its XPath,
        so a field whose nodes are missing from the response is an empty list.
        """
        item = SoccerItem()
        # NOTE: the shot statistics (ShotsH / ShotsA) are reported to come
        # back empty because the page fills them in dynamically; a plain HTTP
        # response may not contain those nodes.
        for field, xpath in self.MATCH_FIELD_XPATHS:
            item[field] = response.xpath(xpath).extract()
        yield item
【问题讨论】:
标签: javascript selenium dynamic request scrapy