如何使用带有 selenium 的 scrapy 来抓取无限滚动页面？答案

【问题标题】：How to use scrapy with selenium to scrape infinite scroll pages?如何使用带有 selenium 的 scrapy 来抓取无限滚动页面？
【发布时间】：2021-12-10 15:05:39
【问题描述】：

我正在尝试使用 scrapy + selenium 来抓取一个在向下滚动时动态加载数据的网页。我尝试了下面的代码，但始终拿不到页面源代码，程序陷入了死循环。

import scrapy
from bs4 import BeautifulSoup
import re
from ..items import ManualScrapingItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from scrapy.selector import Selector
class MyspiderSpider(scrapy.Spider):
    """Spider that loads its start URL in Firefox and scrolls to the page bottom.

    NOTE(review): as written, the scroll loop in ``parse`` only terminates
    when the very first scroll leaves the page height unchanged; see the
    comment inside the loop.
    """

    name = 'manuals'
    
    start_urls = ['https://www.makemytrip.com/hotels/hotel-listing/?checkin=10272021&checkout=10302021&locusId=CTMAA&locusType=city&city=CTMAA&country=IN&searchText=Chennai%20Egmore%20Railway%20Station%2C%20Chennai&roomStayQualifier=1e0e&_uCurrency=INR&mmPoiTag=POI%7CChennai%20Egmore%20Railway%20Station%7CPOI49106%7C13.07817%7C80.25923&reference=hotel&filterData=FREE_BREAKFAST_AVAIL%7CBREAKFAST_AVAIL&type=poi']
    
    def __init__(self):
        # Starts a Firefox WebDriver from a hard-coded geckodriver path and
        # keeps it on the instance for use in parse().
        self.driver = webdriver.Firefox(executable_path="D:\Desktop\work\manual_repeater\geckodriver.exe")

    def parse(self, response):
        """Open ``response.url`` in the Selenium driver, scroll down, and yield the page selector."""
            
        self.driver.get(response.url)
        # Fixed 5-second pause before the first height measurement.
        time.sleep(5)
        # Page height measured once, before any scrolling.
        prev_height = self.driver.execute_script("return document.body.scrollHeight")
        
        while True:
            # Jump to the current bottom of the document, then pause 5 seconds.
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            time.sleep(5)
            new_height =self.driver.execute_script("return document.body.scrollHeight")
            
            # prev_height is never reassigned inside this loop, so the
            # comparison is always against the height measured before the
            # first scroll. Once the page has grown, new_height can no longer
            # equal prev_height and the loop does not exit.
            if new_height == prev_height:            
                break        
        # Wrap the browser's current page source in a Scrapy Selector.
        scrapy_selector = Selector(text = self.driver.page_source) 
        
        # The yielded item holds the Selector object itself under "name".
        yield{"name":scrapy_selector}

【问题讨论】：

标签： python selenium web-scraping scrapy

【解决方案1】：

您需要在 while 循环内更新 prev_height。否则页面高度第一次增长之后，new_height 就永远不会再等于最初记录的 prev_height，循环也就无法退出：

import scrapy
from bs4 import BeautifulSoup
import re
from ..items import ManualScrapingItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from scrapy.selector import Selector
class MyspiderSpider(scrapy.Spider):
    """Spider that renders an infinite-scroll page in Firefox before parsing it.

    The start URL is opened in a Selenium-driven Firefox instance, the page is
    scrolled to the bottom repeatedly until its height stops changing, and the
    resulting page source is wrapped in a Scrapy ``Selector``.
    """

    name = 'manuals'

    start_urls = ['https://www.makemytrip.com/hotels/hotel-listing/?checkin=10272021&checkout=10302021&locusId=CTMAA&locusType=city&city=CTMAA&country=IN&searchText=Chennai%20Egmore%20Railway%20Station%2C%20Chennai&roomStayQualifier=1e0e&_uCurrency=INR&mmPoiTag=POI%7CChennai%20Egmore%20Railway%20Station%7CPOI49106%7C13.07817%7C80.25923&reference=hotel&filterData=FREE_BREAKFAST_AVAIL%7CBREAKFAST_AVAIL&type=poi']

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and start the Firefox WebDriver.

        Args:
            *args: Positional arguments forwarded to ``scrapy.Spider``.
            **kwargs: Keyword (spider) arguments forwarded to ``scrapy.Spider``.
        """
        # Forward arguments so the base class can do its normal setup.
        super().__init__(*args, **kwargs)
        # Raw string: same path value as before, but without relying on
        # unrecognised escape sequences such as "\D" or "\w".
        self.driver = webdriver.Firefox(executable_path=r"D:\Desktop\work\manual_repeater\geckodriver.exe")

    def parse(self, response):
        """Scroll the page to the end in the browser and yield its selector.

        Args:
            response: The Scrapy response for the start URL; only its ``url``
                is used, the page itself is re-fetched through Selenium.

        Yields:
            A dict whose ``"name"`` value is a ``Selector`` built from the
            fully scrolled page source.
        """
        self.driver.get(response.url)
        # Fixed pause before the first height measurement.
        time.sleep(5)
        # Initialise new_height first; it becomes prev_height on the first pass.
        new_height = self.driver.execute_script("return document.body.scrollHeight")

        while True:
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            # Fixed pause after each scroll before re-measuring.
            time.sleep(5)

            # Compare against the height from the previous iteration, not the
            # height measured before any scrolling happened.
            prev_height = new_height
            new_height = self.driver.execute_script("return document.body.scrollHeight")

            # No growth since the last scroll: treat the page as fully loaded.
            if new_height == prev_height:
                break

        scrapy_selector = Selector(text=self.driver.page_source)

        # NOTE(review): this yields the Selector object itself; real field
        # extraction from scrapy_selector presumably replaces this.
        yield {"name": scrapy_selector}

    def closed(self, reason):
        """Quit the browser when the spider closes so Firefox is not left running.

        Args:
            reason: The close reason passed in by Scrapy (unused).
        """
        self.driver.quit()

【讨论】：