【问题标题】:The page doesn't scraping页面不抓取
【发布时间】:2020-07-14 23:57:49
【问题描述】:

我正在尝试抓取此页面

https://www.vivareal.com.br/venda/pernambuco/recife/#onde=BR-Pernambuco-NULL-Recife

我刮了这个网站的第一页,然后用selenium点击到下一页,但我只能得到第一页的内容,当我第二页时,它与第一页的内容相同。我不知道如何解决这个问题,或者网页是否对抓取有一些保护。

有人可以帮我吗?

from bs4 import BeautifulSoup
import pandas as pd 
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from selenium import webdriver

def scrape():
    cont = [True,True,True,True,False]

    for times in cont:

        if times != True:
            driver = webdriver.Firefox(executable_path = 'geckodriver')
            
            page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
            
            sleep(15)

            titles = []
            addresses = []
            areas = []
            rooms = []
            bathes = []
            values = []

            start_time = time()
            request = 0
            
            soup = BeautifulSoup(page,'html.parser')
            imov = soup.find_all('div', class_='property-card__main-content')

            sleep(randint(8,15))

            # Monitor
            request += 1
            elapsed_time = time() - start_time
            print('Request: {}; Frequency: {} requests/s'.format(request, request/elapsed_time))
            clear_output(wait = True)

            # Throw a warning for non-200 status codes
            if page.status_code != 200:
                warn('Request: {}; Status code: {}'.format(requests, page.status_code))

            # Break the loop if the number of requests is greater than expected
            if request > 72:
                warn('Number of requests was greater than expected.')
                break
                
            for container in imov:
                # Título
                title = container.h2.a.get_text()
                t2 = title.strip()
                titles.append(t2)
                # Título

                # Endereço
                address = container.h2.span.get_text()
                a2 = address.strip()
                addresses.append(a2)
                # Endereço

                # Área
                area = container.li.span.get_text()
                ar2 = area.strip()
                areas.append(ar2)
                # Área

                # Quartos
                room = container.find(class_= "property-card__detail-item property-card__detail-room js-property-detail-rooms")
                room2 = room.find('span', class_="property-card__detail-value js-property-card-value").get_text()
                r2 = room2.strip()
                rooms.append(r2)
                # Quartos

                # Banheiros
                bath = container.find(class_= "property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom")
                bath2 = bath.find('span', class_="property-card__detail-value js-property-card-value").get_text()
                b2 = bath2.strip()
                bathes.append(b2)
                # Banheiros

                # Valor
                value = container.section.div.get_text()
                v2 = value.strip()
                values.append(v2)
                # Valor

                # Dataframe e salvar
                vivareal = pd.DataFrame({
                    "title": titles, 
                    "address": addresses, 
                    "area": areas, 
                    "rooms":rooms,
                    "baths":bathes,
                    "value":values
                    })
                                
                vivareal.to_csv(r'output.csv')

            prox = driver.find_element_by_xpath('//*[@title="Próxima página"]')
            prox.click()
        else:
            print('Done!')
            
scrape()```

【问题讨论】:

  • 你不只是获得相同的页面吗?从page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife') 行开始,它并不表示任何页面递增
  • 正如 ewong 所说,您更改页面但返回相同的页面。
  • 你在研究伯南布哥的真实状态吗?这是一个很好的研究。我试图在圣保罗做同样的事情。你有 Github 页面吗?
  • 我用prox.click() 在最终代码处添加了一个点击命令以进入下一页,有什么问题吗? @ewong
  • 我没有。我还是编程领域的新手,然后我没有注册任何 Github 帐户@MarceloBaliu

标签: python selenium web-scraping


【解决方案1】:

虽然你把click命令放在了最后,但是当它进入下一个循环时,第一个命令是创建一个新的驱动程序,然后调用该命令将Viva Real的主页带到Pernambuco。这是不希望的。你可以这样做:

def scrape():
    cont = [True,True,True,True,False]

    # You create the driver and access the main page only once
    driver = webdriver.Firefox(executable_path = 'geckodriver')
    page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')

    for times in cont:

        if times != True:
            # Wait to load every page
            sleep(15)

【讨论】:

    【解决方案2】:

    即使使用@MarceloBaliu 提供的修复程序,您的代码也无法正常工作。这是我的代码(终于!)为我工作。我之所以分享,是因为它可以帮助别人,就像我被这个网站所帮助一样。

    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, NoSuchElementException
    from selenium.webdriver.common.by import By 
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from bs4 import BeautifulSoup
    import re
    import time
    import pandas as pd
    
    class ScraperVivaReal:
        wait_time = 5
        
        def __init__(self, url):
            # Initializing the webdriver
            options = webdriver.FirefoxOptions()
            options.add_argument('-headless')
            self.driver = webdriver.Firefox(options=options)
            self.driver.maximize_window()
            self.driver.get(url)
            time.sleep(self.wait_time)
            # Handling cookies acception
            WebDriverWait(self.driver, self.wait_time).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="cookie-notifier-cta"]'))).click()
            time.sleep(self.wait_time/2)
            
        def __scrape_page__(self):
            result = []
            
            # Extracting data from the page
            try:
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            except WebDriverException:
                print('Webdriver was manually quit by the user!') # I configure this exception before adding the option -headless to webdriver
                return result
            
            # Finding property cards containing search results
            div_list = soup.find_all('div', {'class':'property-card__content'})
            
            # Iterating each card
            for d in div_list:
    
                # Extracting info from card
                title = d.find('span', {'class': 'property-card__title js-cardLink js-card-title'}).get_text().strip()
                complete_address = d.find('span', {'class': 'property-card__address'}).get_text().strip()
                area = d.find('span', {'class': 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area'}).get_text().strip()
                rooms = d.find('li', {'class': 'property-card__detail-item property-card__detail-room js-property-detail-rooms'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
                baths = d.find('li', {'class': 'property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
                garage = d.find('li', {'class': 'property-card__detail-item property-card__detail-garage js-property-detail-garages'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
                
                # Extracting the price
                try:
                    price = d.find('div', {'class':'property-card__price js-property-card-prices js-property-card__price-small'}).find('p').get_text().strip()
                except AttributeError:
                    price = "N/I"
                
                # Splitting the address
                add_list = re.split(',|-', complete_address)
                add_list = [ item.strip() for item in add_list ]
                if len(add_list) == 2:
                    city, st = add_list
                    neibhood = 'N/I'
                    address = 'N/I'
                    number = 'N/I'
                if len(add_list) == 3:
                    neibhood, city, st = add_list
                    address = 'N/I'
                    number = 'N/I'
                if len(add_list) == 4:
                    address, neibhood, city, st = add_list
                    number = 'N/I'
                elif len(add_list) == 5:
                    address, number, neibhood, city, st = add_list
                    
                # Adding the result into a dicionary and appending the dict to a result list
                row = { 'Título': title, 'Endereço': address, 'Número': number, 'Bairro': neibhood, 'Cidade': city, 'Estado': st, 'Área': area, 'Quartos': rooms, 'Banheiros': baths, 'Vagas': garage, 'Preço': price }
                result.append(row)
            return result
            
        def __next_page__(self):
            # Finding the "Next Page" button element
            next_element = self.driver.find_element_by_xpath('//*[@title="Próxima página"]')
            try:
                # Trying to click it
                next_element.click()
                time.sleep(self.wait_time)
                return True
            # Treating some exceptions (element not found and element not clickable)
            except ElementClickInterceptedException:
                print('"Próxima Página" element is not clickable!')
            except NoSuchElementException:
                print('"Próxima Página" element not found!')
            return False
            
        def run(self, output):
            has_next = True
            final_result = []
            # Getting the information!
            while has_next:
                results = self.__scrape_page__()
                final_result.extend(results)
                print('Got {} results! Total Found: {}'.format(len(results), len(final_result)))
                if len(results) == 0:
                    break
                has_next = self.__next_page__()
            # Quitting Firefox
            self.driver.quit()
            # Exporting results to CSV
            df = pd.DataFrame(final_result)
            df.to_csv(output, sep=',')
    
    S = ScraperVivaReal('https://www.vivareal.com.br/venda/sp/paulinia/')
    S.run('output.csv')
    

    【讨论】:

    • 兄弟,谢谢你的分享。但是我有一个关于此代码部分的问题WebDriverWait(self.driver, self.wait_time).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="cookie-notifier-cta"]'))).click() 我不太明白如何在页面中找到这部分,因为它与页面的 cookie 相关。处理这个的原因是什么?
    • 这个元素是页面的底部(它是“接受”按钮,接受Cookies)。如果您完全等待页面加载,您可以检查它并找到该元素。当我上完这门课时,页面挡住了导航按钮。如果我不接受 cookie,并且元素在初始加载时不可用,那么 WebDriverWait 基本上会阻止执行,直到该元素可用。在不接受 cookie 的情况下,Selenium 无法“单击”下一步按钮,算法被卡在搜索结果的第一页。希望澄清。
    • 如果您手动导航 VivaReal,您可以正常单击“下一步”按钮。但是 Selenium 在不接受 cookie 的情况下无法做到这一点。
    • 你非常清楚我,我的原始代码没有等待cookies,然后它没有运行。我怎么能在页面中看到这个 cookie 的 ID?我不知道在哪里搜索这样的页面中的cookie "id = cookie-notifier-cta"
    • 在 Google Chrome 中,您可以右键单击要检查的元素(本例中为“接受”按钮)并在上下文菜单中选择“检查”。检查选项卡打开,光标直接跳到所选元素。在 VivaReal 中,执行此结果:<button class="cookie-notifier__cta" id="cookie-notifier-cta">Entendi</button>。该元素位于名为“cookie-notifier”的 div 类中
    猜你喜欢
    • 1970-01-01
    • 2018-09-22
    • 1970-01-01
    • 2016-09-11
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多