【问题标题】:How to parse data after specific text Python Selenium bs4(如何用 Python Selenium / bs4 解析位于特定文本之后的数据)
【发布时间】:2022-08-02 21:27:02
【问题描述】:

在我正在为其编写解析器的某个网站上,我遇到了以下问题:我需要获取表格中的所有数据,但这些数据在 html 代码中没有任何标识,而且位置会互换(见 html example)

该表如下所示: table

一开始我是用 XPATH 来做的,但解析时发现有些页面的数据位置会互换(例如发动机和注册号),而有些页面则完全没有互换。所以固定的 XPATH 并不合适,因为这样一来里程数据可能会被写入 csv 文件中发动机那一栏

是否有可能以某种方式在 selenium 中或通过 bs4 首先搜索一个单词,然后解析它之后的数据?

也就是在html代码中会找到Engine这个词,然后取下面的数据 html text that I need

我的代码:

import csv
import time
import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium_stealth import stealth

def collect_data():
    """Scrape Ford Mustang listings from nettiauto.com into CSV.

    Steps:
      1. Open the search results in Chrome (via selenium-stealth) and read
         the total page count from the ``span.totPage`` element.
      2. Visit every results page, collect the ``a.tricky_link`` hrefs and
         save them to ``ford_mustang_url.txt`` (one URL per line).
      3. Visit every listing, extract its fields and write all rows to
         ``ford_mustang.csv``.

    Any exception is printed rather than raised, and the browser is always
    shut down on exit. A listing that lacks one of the expected elements is
    reported and skipped so one odd page does not discard the whole run.
    """
    # Kept global for backward compatibility: the driver has always been
    # exposed at module level by this function.
    global driver
    driver = None  # lets ``finally`` tell whether Chrome was ever started

    options = webdriver.ChromeOptions()
    # ChromeOptions has no set_preference() (that is the Firefox options
    # API); Chrome takes the user agent as a command-line switch.
    options.add_argument(
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 '
        'Safari/537.36'
    )
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # For background mode add: options.add_argument('headless')

    try:
        driver = webdriver.Chrome(options=options)
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )
        # Set once: the implicit wait is a session-wide setting for element
        # lookups, so re-setting it after every page load changed nothing.
        driver.implicitly_wait(30)

        driver.get(
            url='https://www.nettiauto.com/en/ford/mustang?yfrom=1980'
        )
        time.sleep(10)

        # Collect all listing URLs.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        total_page = soup.find('span', class_='totPage').text
        print('Ford Mustang')
        print(f'Total pages: {total_page}')

        car_url_list = []
        for page in range(1, int(total_page) + 1):
            driver.get(
                url=f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={page}'
            )
            soup = BeautifulSoup(driver.page_source, 'lxml')
            for card in soup.find_all('a', class_='tricky_link'):
                href = card.get('href')
                if href:  # skip anchors that carry no href
                    car_url_list.append(href)
            # Report the page that was actually processed (the previous
            # counter ran one ahead and ended at "Page N+1 of N").
            print(f'Page {page} of {total_page} URL collected')

        # Written once after the loop instead of rewriting the whole file
        # on every page.
        with open('ford_mustang_url.txt', 'w', encoding='utf8') as file:
            file.writelines(f'{line}\n' for line in car_url_list)

        # Collect each car's data. Iterate the in-memory list rather than
        # re-reading the file, whose lines end with a newline.
        print('Total cars: ' + str(len(car_url_list)))
        rows = []
        for count, listing_url in enumerate(car_url_list, start=1):
            driver.get(listing_url)
            soup = BeautifulSoup(driver.page_source, 'lxml')
            try:
                data = _parse_car(soup)
            except AttributeError as ex:
                # soup.find() returned None for an expected element.
                print(f'{count}. Skipped {listing_url}: {ex}')
                continue

            print(f'{count}. ' + data['Name'])
            print('Price: ' + f'{data["Price"]}')
            print('Year: ' + f'{data["Year"]}')
            print('Mileage: ' + f'{data["Mileage"]}')
            print('Reg.Number: ' + f'{data["Reg.Number"]}')
            print('URL: ' + f'{data["URL"]}\n')
            rows.append(data)

        csv_title = ['Name', 'Price', 'Year', 'Mileage', 'Reg.Number', 'URL']
        with open('ford_mustang.csv', 'w', encoding='utf8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_title)
            writer.writeheader()
            writer.writerows(rows)

    except Exception as ex:
        # Top-level boundary of the script: report and fall through to
        # the browser cleanup below.
        print(ex)
    finally:
        # quit() closes every window and ends the session, so a separate
        # close() is unnecessary; guard against Chrome never starting.
        if driver is not None:
            driver.quit()


def _parse_car(soup):
    """Extract one listing's fields from its parsed page.

    Returns a dict keyed by the CSV column names. Raises AttributeError
    when an expected element is absent (``soup.find`` returned ``None``).
    """
    car_name = (
        soup.find('title').text
        .replace('Nettiauto', '')
        .replace('-', '')
        .replace('Used vehicle', '')
        .replace('Vaihtoauto', '')
        .replace('  ', ' ')
        .strip()
    )
    car_price = soup.find('span', class_='GAPrice').find('span').text
    # Year and mileage are data attributes of the same element.
    mid_border = soup.find('div', class_='mid_border')
    car_year = mid_border.get('data-year')
    car_mileage = mid_border.get('data-mileage')
    car_reg_number = soup.find(
        'div', class_='rekkari-banner__body_input'
    ).text.strip()
    car_url = soup.find('link', hreflang='en').get('href')
    # TODO: car_engine is not extracted yet.

    # These two literal values are stored as "missing" instead
    # (presumably site placeholders -- confirm against the site).
    if car_reg_number == 'ABC-123':
        car_reg_number = None
    if car_mileage == '100000000':
        car_mileage = None

    return {
        'Name': car_name,
        'Price': car_price,
        'Year': car_year,
        'Mileage': car_mileage,
        'Reg.Number': car_reg_number,
        'URL': car_url,
    }


def main():
    """Script entry point: run the scraper once via ``collect_data``."""
    collect_data()


# Run the scraper only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
  • 发布一个最小的可复制示例,任何想要帮助您的人都可以复制粘贴。请不要张贴图片。发布您的代码,展示您到目前为止所尝试的内容。
  • @platipus_on_fire,我问的是有没有办法在 html 代码中按特定文本进行搜索(而不是通过链接、标签名称、类等),并不是想要一份现成的代码。我已有的代码在这里帮不上忙,它只是收集所有 URL 以及这些 URL 页面中的部分数据,这些我已经写好了。我尝试过 XPATH,但不同汽车页面中表格单元格的位置会互换,所以它不适用。那么,我为什么要贴出使用了不合适方法、或与问题无关的代码呢?图片是用来展示网站上的表格和一些细节的。
  • 因为任何愿意花时间和精力免费帮助您的人都应该能够轻松地重新创建您的上下文。见stackoverflow.com/help/how-to-ask
  • 请提供足够的代码,以便其他人可以更好地理解或重现该问题。

标签: python selenium parsing beautifulsoup


【解决方案1】:

这是针对您的问题的解决方案,而不是基于 selenium(它不是这项工作的正确工具),它将生成一个包含您所追求的所有详细信息的数据框/csv:

import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

from io import StringIO

# Scrape every Ford Mustang listing on nettiauto.com and dump the details
# table of each car, one row per car, into finnish_cars.csv.
scraper = cloudscraper.create_scraper()

# Step 1: collect the listing URLs from the result pages.
# NOTE: the page range is hard-coded; adjust it if the number of result
# pages on the site changes.
urls_list = []
for page in tqdm(range(1, 8)):
    r = scraper.get(f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={page}')
    soup = BeautifulSoup(r.text, 'html.parser')
    urls_list.extend(
        link.get('href')
        for link in soup.select_one('div#listingData').select('a.tricky_link')
    )

# Step 2: turn each listing's details table into a one-row DataFrame.
frames = []
for url in tqdm(urls_list):
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Passing literal HTML to read_html is deprecated (pandas >= 2.1);
    # wrapping it in StringIO works on old and new versions alike.
    dfs = pd.read_html(StringIO(r.text))
    heading = soup.select_one('#heightForSlogan')
    title = heading.select_one('h1').get_text(strip=True)
    subtitle = heading.select_one('h2').get_text(strip=True)
    df_list = [('make_model', title), ('variant', subtitle)]
    # Each table row contributes two label/value pairs: columns 0-1 and
    # columns 3-4.
    for _, row in dfs[0].iterrows():
        df_list.append((row[0], row[1]))
        df_list.append((row[3], row[4]))
    # Transpose so the labels form the first row, then promote that row
    # to the column header.
    correct_df = pd.DataFrame(df_list).T
    new_header = correct_df.iloc[0]
    correct_df = correct_df[1:]
    correct_df.columns = new_header
    frames.append(correct_df)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; concatenate once at the end instead.
big_df = pd.concat(frames) if frames else pd.DataFrame()
big_df.to_csv('finnish_cars.csv')

一些注意事项:前 2 辆车的描述是芬兰语,其余的是英语,所以最终得到的 df/csv 看起来会有点奇怪,但数据都在里面。此外,您可能会在终端中看到一些 pandas 警告,提示 append 已弃用、应改用 concat,但这些只是警告,程序仍会运行(仅限 pandas 2.0 之前的版本;自 2.0 起 DataFrame.append 已被移除,必须改用 pd.concat)。

您可以使用pip install cloudscraper 安装cloudscraper,使用pip install tqdm 安装tqdm。当然,如果你热衷于使用 Selenium,你可以对从 selenium 获得的 html 应用相同的方法。

【讨论】:

  • 谢谢你的回答,但我需要的是特定格式的 csv 文件,所以它对我不适用。我想我已经在下面的答案中找到了解决方案。
【解决方案2】:

我通过使用 if else 找到了一个基于 Selenium 的解决方案:

# The "Engine" label sits in tr[2]/td[1] on some listings; when it does the
# value is the cell right after it, otherwise the value is read from
# tr[1]/td[5].
engine_label = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[1]').text

if engine_label == 'Engine':
    engine_cell = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[2]')
else:
    engine_cell = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[1]/td[5]')
# Keep only the first space-separated token of the cell text.
car_engine = engine_cell.text.split(" ", 2)[0]

对于驱动方式(Drive type)这种做法不起作用,所以我这样做了......

# The "Drive type" label appears in different table cells depending on the
# listing. Probe the known label cells in order; the value is always the
# cell immediately after the label.
table_xpath = '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody'
drive_type_cells = (
    # (label cell, value cell)
    ('tr[2]/td[4]', 'tr[2]/td[5]'),
    ('tr[3]/td[4]', 'tr[3]/td[5]'),
    ('tr[4]/td[1]', 'tr[4]/td[2]'),
    ('tr[3]/td[1]', 'tr[3]/td[2]'),
)

# Stays None when no probed cell carries the label, so an unrelated label
# text is never stored as the drive type.
drive_type = None
for label_cell, value_cell in drive_type_cells:
    label = driver.find_element(By.XPATH, f'{table_xpath}/{label_cell}').text
    if label == 'Drive type':
        drive_type = driver.find_element(By.XPATH, f'{table_xpath}/{value_cell}').text
        break

【讨论】:

    猜你喜欢
    • 2014-12-16
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2012-02-05
    • 2018-12-03
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多