【问题标题】:want to extract data from a site using selenium-webdriver(想要使用 selenium-webdriver 从站点中提取数据)
【发布时间】:2021-02-11 14:19:23
【问题描述】:

我想提取网站上每家公司的名称、网站、电话和电子邮件,但当我尝试查找网站、电话和电子邮件时,代码只会一遍又一遍地打印页面上第一家公司的名称。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Scrape name, website, phone and e-mail for each listed company, page by page.
# This is the asker's original script, kept as posted: it reproduces the
# reported symptom (the first company's name is printed over and over).
url='https://www.dmcc.ae/business-search?directory=1&submissionGuid=2c8df029-a92e-4b5d-a014-7ef9948e664b'
driver = webdriver.Firefox()
driver.get(url)
# Every explicit wait below gives up after 50 seconds.
wait=WebDriverWait(driver,50)

# Click the confirmation button once it is clickable
# (presumably the cookie-consent banner -- confirm against the page).
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#hs-eu-confirmation-button"))).click()
# The search results live inside an iframe; switch the driver into it.
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,'#pym-0 > iframe')))
# NOTE(review): `list` shadows the builtin, and the class name carries a
# trailing space.
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
# Pager link clicked to advance to the following page (located by position,
# li[13] -- presumably the "next" link; verify against the page markup).
button = wait.until(EC.element_to_be_clickable((By.XPATH,'./html/body/div[5]/div/ul/li[13]/a')))



# Hard-coded number of result pages to walk through.
numOfPages=1161
# Seconds to sleep before each page change; grows by one whenever i % 40 == 0
# (which is also true on the very first iteration, i == 0).
counter=4

for i in range(numOfPages):

    driver.execute_script("arguments[0].scrollIntoView();", button)

    for e in list:
        # NOTE(review): each XPath below starts with "/html", i.e. it is an
        # absolute path evaluated from the document root rather than relative
        # to `e`, so every iteration resolves to the same first match. A path
        # relative to `e` has to start with "./" or ".//".
        name = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[1]/h4').text
        print(name)
        website = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[1]/td[2]/a').text
        print(website)
        phone = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[2]/td[2]/a').text
        print(phone)
        email = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[3]/td[2]/a').text
        print(email)
 
    time.sleep(counter)  
    button.click()
    # Re-read the entries after the page change.
    list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
    if i%40==0:
        counter+=1

我的问题在于这些代码行

# Excerpt the asker singled out, kept as posted (including its indentation).
# NOTE(review): every XPath starts with "/html", so it is resolved from the
# document root instead of relative to `e`; a relative path starts with ".//".
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
 for e in list:
        name = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[1]/h4').text
        print(name)
        website = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[1]/td[2]/a').text
        print(website)
        phone = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[2]/td[2]/a').text
        print(phone)
        email = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[3]/td[2]/a').text
        print(email)

【问题讨论】:

  • 您正在搜索的数据在 iframe 中。您需要切换到 iframe,然后使用 find by
  • 我这里已经切换到iframe wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,'#pym-0 > iframe')))
  • 抱歉,我之前没有注意到您已经切换到 iframe。……我看到 (By.CLASS_NAME,'searched-list ') 的末尾有一个空格。请尝试使用 (By.CSS_SELECTOR, '.searched-list.ng-scope')。请告诉我这样是否可行
  • 我试了还是不行

标签: python selenium selenium-webdriver web-scraping webdriver


【解决方案1】:

我建议您使用其他查找元素的方法,使您的代码更具可读性。我对您的代码进行了一些更改,希望对您获取数据有所帮助:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time


# Directory search page; its result list is read from inside an iframe below.
url = (
    "https://www.dmcc.ae/business-search"
    "?directory=1&submissionGuid=2c8df029-a92e-4b5d-a014-7ef9948e664b"
)
driver = webdriver.Firefox()
driver.get(url)
wait = WebDriverWait(driver, 50)  # each explicit wait gives up after 50 s

# Click the confirmation button as soon as it becomes clickable.
confirm_locator = (By.CSS_SELECTOR, "#hs-eu-confirmation-button")
wait.until(EC.element_to_be_clickable(confirm_locator)).click()

# Move the driver's context into the iframe that holds the results.
frame_locator = (By.CSS_SELECTOR, "#pym-0 > iframe")
wait.until(EC.frame_to_be_available_and_switch_to_it(frame_locator))

# Visible entries of the current page. NOTE: the name shadows the builtin
# `list`, but the rest of the script reads it under this name.
entries_locator = (By.CLASS_NAME, "list-title ")
list = wait.until(EC.visibility_of_all_elements_located(entries_locator))

# Pager link whose ng-click moves to the next page.
next_locator = (By.CSS_SELECTOR, "a[ng-click='setPage(pager.currentPage + 1)']")
button = wait.until(EC.element_to_be_clickable(next_locator))

# Seconds to sleep after each page change.
counter = 4


def getText(element):
    """Return ``element.text``, or the placeholder "---" when it is empty."""
    return element.text or "---"


def getContactInfo(parent):
    """Return the "contact-info" child of *parent*, or None if the lookup fails.

    The lookup is deliberately best-effort: an entry without a contact section
    must not abort the scrape, so lookup errors are turned into ``None``.
    Only ``Exception`` subclasses are swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate so a long-running scrape can be stopped.

    Args:
        parent: Element exposing ``find_element_by_class_name``.

    Returns:
        The matching child element, or ``None`` when it could not be located.
    """
    try:
        return parent.find_element_by_class_name("contact-info")
    except Exception:
        # Missing contact section (or any other lookup failure): report "absent".
        return None

def _printField(parent, css_selector):
    """Print the text of the first match of *css_selector* under *parent*.

    ``find_elements_by_css_selector`` returns an empty list when nothing
    matches (the singular form raises instead), so a company that has no such
    link is printed as the "---" placeholder rather than aborting the scrape.
    """
    matches = parent.find_elements_by_css_selector(css_selector)
    print(getText(matches[0]) if matches else "---")


# Scrape the current page first and only then decide whether to advance, so
# the last page (whose pager <li> is "disabled") is scraped as well.
try:
    while True:
        for e in list:
            name = e.find_element_by_tag_name("h4")
            print(getText(name))
            account_info = e.find_element_by_css_selector(
                "div.account-Info.large-12.columns.ng-scope"
            )
            contact_info = getContactInfo(account_info)

            if contact_info is not None:
                _printField(contact_info, "a.website.ng-binding.ng-scope")
                _printField(contact_info, "a.telephone.ng-binding")
                _printField(contact_info, "a.emailid.ng-binding")

            print("*******\n")

        # Last Page has disabled the li element
        pager_classes = (
            driver.find_element_by_css_selector(
                "li[ng-class='{disabled:pager.currentPage === pager.totalPages}']"
            )
            .get_attribute("class")
            .split()
        )
        if "disabled" in pager_classes:
            break

        driver.execute_script("arguments[0].scrollIntoView();", button)
        button.click()
        time.sleep(counter)
        list = wait.until(
            EC.visibility_of_all_elements_located((By.CLASS_NAME, "list-title "))
        )
finally:
    # Close the browser even when scraping fails part-way through.
    driver.quit()

【讨论】:

  • 感谢您的帮助和编辑,我想知道是否有办法在没有电话、电子邮件或网站的公司上放置 (-) 符号或只是空白。
  • 当然,检查编辑,我添加了一个处理这些情况的函数
  • 很抱歉打扰您其他没有的,有电话号码
猜你喜欢
  • 1970-01-01
  • 2016-03-06
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2015-12-12
  • 1970-01-01
  • 1970-01-01
相关资源
最近更新 更多