【问题标题】:How to use scrapy with selenium to scrape infinite scroll pages?如何使用带有 selenium 的 scrapy 来抓取无限滚动页面?
【发布时间】:2021-12-10 15:05:39
【问题描述】:

我正在尝试使用 scrapy + selenium 来抓取一个在向下滚动时动态加载数据的网页。我尝试了下面的代码,但始终拿不到页面源代码,程序陷入了死循环。

import scrapy
from bs4 import BeautifulSoup
import re
from ..items import ManualScrapingItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from scrapy.selector import Selector
class MyspiderSpider(scrapy.Spider):
    """Spider that opens the start URL in Selenium-driven Firefox and scrolls it.

    The page is loaded in a real browser, scrolled to the bottom repeatedly,
    and the resulting page source is wrapped in a Scrapy ``Selector``.
    """
    name = 'manuals'
    
    start_urls = ['https://www.makemytrip.com/hotels/hotel-listing/?checkin=10272021&checkout=10302021&locusId=CTMAA&locusType=city&city=CTMAA&country=IN&searchText=Chennai%20Egmore%20Railway%20Station%2C%20Chennai&roomStayQualifier=1e0e&_uCurrency=INR&mmPoiTag=POI%7CChennai%20Egmore%20Railway%20Station%7CPOI49106%7C13.07817%7C80.25923&reference=hotel&filterData=FREE_BREAKFAST_AVAIL%7CBREAKFAST_AVAIL&type=poi']
    
    # NOTE(review): this override does not call scrapy.Spider.__init__.
    def __init__(self):
        # Start a Firefox session using the geckodriver binary at this path.
        self.driver = webdriver.Firefox(executable_path="D:\Desktop\work\manual_repeater\geckodriver.exe")

    def parse(self, response):
        """Load ``response.url`` in the browser, scroll down, and yield the selector."""
            
        self.driver.get(response.url)
        # Fixed pause after navigation before measuring the page height.
        time.sleep(5)
        # Height of the document before any scrolling has happened.
        prev_height = self.driver.execute_script("return document.body.scrollHeight")
        
        while True:
            # Jump to the current bottom of the page, then pause before re-measuring.
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            time.sleep(5)
            new_height =self.driver.execute_script("return document.body.scrollHeight")
            
            # NOTE: prev_height is never reassigned inside this loop, so it always
            # holds the initial height. Once the page has grown, this comparison
            # can no longer be true and the loop does not terminate.
            if new_height == prev_height:            
                break        
        # Build a Scrapy selector from the browser's current page source.
        scrapy_selector = Selector(text = self.driver.page_source) 
        
        # The yielded item carries the Selector object itself under "name".
        yield{"name":scrapy_selector}      

 
        

【问题讨论】:

    标签: python selenium web-scraping scrapy


    【解决方案1】:

    您需要在 while 循环内更新 prev_height。否则 prev_height 始终是滚动前的初始高度,一旦页面加载了更多内容,new_height == prev_height 就永远不会成立,循环也就无法退出。

    import scrapy
    from bs4 import BeautifulSoup
    import re
    from ..items import ManualScrapingItem
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.keys import Keys
    import time
    from scrapy.selector import Selector
    class MyspiderSpider(scrapy.Spider):
        """Spider that renders an infinite-scroll page with Selenium before parsing.

        The start URL is opened in a Selenium-driven Firefox instance and
        scrolled to the bottom until the document height stops growing, so
        that lazily loaded content is present in the page source.
        """

        name = 'manuals'

        start_urls = ['https://www.makemytrip.com/hotels/hotel-listing/?checkin=10272021&checkout=10302021&locusId=CTMAA&locusType=city&city=CTMAA&country=IN&searchText=Chennai%20Egmore%20Railway%20Station%2C%20Chennai&roomStayQualifier=1e0e&_uCurrency=INR&mmPoiTag=POI%7CChennai%20Egmore%20Railway%20Station%7CPOI49106%7C13.07817%7C80.25923&reference=hotel&filterData=FREE_BREAKFAST_AVAIL%7CBREAKFAST_AVAIL&type=poi']

        def __init__(self, *args, **kwargs):
            """Initialise the spider and start the Firefox WebDriver.

            Positional and keyword arguments are forwarded to
            ``scrapy.Spider.__init__`` so spider arguments keep working.
            """
            super().__init__(*args, **kwargs)
            # Raw string: the Windows path contains backslashes that would
            # otherwise be parsed as (invalid) escape sequences.
            # NOTE(review): ``executable_path`` is deprecated in Selenium 4 in
            # favour of a ``Service`` object - confirm against the installed
            # Selenium version.
            self.driver = webdriver.Firefox(executable_path=r"D:\Desktop\work\manual_repeater\geckodriver.exe")

        def parse(self, response):
            """Scroll the page to the bottom and yield the rendered selector.

            Args:
                response: The Scrapy response for the start URL; only its
                    ``url`` is used, the page itself is re-fetched by Selenium.

            Yields:
                dict: ``{"name": Selector}`` built from the fully scrolled
                page source.
            """
            self.driver.get(response.url)
            time.sleep(5)
            # Initialise new_height first; it becomes prev_height on the
            # first iteration of the loop below.
            new_height = self.driver.execute_script("return document.body.scrollHeight")

            while True:
                self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                # Pause so newly requested content can be appended to the page.
                time.sleep(5)

                # Update prev_height on every pass so the comparison is always
                # against the height measured just before this scroll.
                prev_height = new_height
                new_height = self.driver.execute_script("return document.body.scrollHeight")

                # Height unchanged after a scroll: nothing more was loaded.
                if new_height == prev_height:
                    break

            scrapy_selector = Selector(text=self.driver.page_source)

            # NOTE(review): this yields the Selector object itself; extract the
            # wanted fields (e.g. via .css()/.xpath()) if the item is exported.
            yield {"name": scrapy_selector}

        def closed(self, reason):
            """Quit the browser when Scrapy closes the spider.

            Args:
                reason: The close reason passed by Scrapy (unused).
            """
            self.driver.quit()
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2019-10-27
      • 1970-01-01
      • 2023-03-08
      • 2020-02-18
      • 2018-04-16
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多