【发布时间】:2016-09-20 18:28:36
【问题描述】:
我正在抓取一个严重依赖 Javascript 的网站。我需要从中提取要解析的 url 的主页取决于 Javascript,因此我必须修改 start_requests。 我正在寻找一种方法来连接 start_requests、linkextractor 和 process_match
class MatchSpider(CrawlSpider):
name = "match"
allowed_domains = ["whoscored"]
rules = (
Rule(LinkExtractor(restrict_xpaths='//*[contains(@class, "match-report")]//@href'), callback='parse_item'),
)
def start_requests(self):
url = 'https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/Fixtures/England-Premier-League-2016-2017'
browser = Browser(browser='Chrome')
browser.get(url)
# should return a request with the html body from Selenium driver so that LinkExtractor rule can be applied
def process_match(self, response):
match_item = MatchItem()
regex = re.compile("matchCentreData = \{.*?\};", re.S)
match = re.search(regex, response.text).group()
match = match.replace('matchCentreData =', '').replace(';', '')
match_item['match'] = json.loads(match)
match_item['url'] = response.url
match_item['project'] = self.settings.get('BOT_NAME')
match_item['spider'] = self.name
match_item['server'] = socket.gethostname()
match_item['date'] = datetime.datetime.now()
yield match_item
我在 Selenium 周围使用的包装器:
class Browser:
"""
selenium on steroids. allows you to create different types of browsers plus
adds methods for safer calls
"""
def __init__(self, browser='Firefox'):
"""
type: silent or not
browser: chrome of firefox
"""
self.browser = browser
self._start()
def _start(self):
'''
starts browser
'''
if self.browser == 'Chrome':
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_extension('./libcommon/adblockpluschrome-1.10.0.1526.crx')
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument("user-agent={0}".format(random.choice(USER_AGENTS)))
self.driver_ = webdriver.Chrome(executable_path='./libcommon/chromedriver', chrome_options=chrome_options)
elif self.browser == 'Firefox':
profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", random.choice(USER_AGENTS))
profile.add_extension('./libcommon/adblock_plus-2.7.1-sm+tb+an+fx.xpi')
profile.set_preference('permissions.default.image', 2)
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
profile.set_preference("webdriver.load.strategy", "unstable")
self.driver_ = webdriver.Firefox(profile)
elif self.browser == 'PhantomJS':
self.driver_ = webdriver.PhantomJS()
self.driver_.set_window_size(1120, 550)
def close(self):
self.driver_.close()
def return_when(self, condition, locator):
"""
returns browser execution when condition is met
"""
for _ in range(5):
with suppress(Exception):
wait = WebDriverWait(self.driver_, timeout=100, poll_frequency=0.1)
wait.until(condition(locator))
self.driver_.execute_script("return window.stop")
return True
return False
def __getattr__(self, name):
"""
ruby-like method missing: derive methods not implemented to attribute that
holds selenium browser
"""
def _missing(*args, **kwargs):
return getattr(self.driver_, name)(*args, **kwargs)
return _missing
【问题讨论】:
-
你的问题是?您是否尝试从
browser.get(url)获取 HTML? -
不要认为它是重复的,但这可能会对您有所帮助:stackoverflow.com/questions/17975471/…
-
问题是,在我的情况下,从一开始就需要使用 Selenium 完成第一个请求。
-
你能把我链接到你正在使用的
Browser类吗?我在 Selenium 的文档中找不到任何关于它的信息。 -
它只是 Selenium 的一个包装器。我编辑了帖子。