【问题标题】:Is there a way to run unique instances of a class in threads in a web scraper有没有办法在网络爬虫的线程中运行一个类的唯一实例
【发布时间】:2021-11-29 15:52:23
【问题描述】:

我目前正在做一个项目,该项目需要我从cheapflights 中抓取数据。由于这是一个集体项目,我们决定从主页上最受欢迎的城市中抓取数据。

在我的例子中,它是一个带有城市字典的目的地列表:城市的 url 代码

例如

cities = ["Amsterdam", ... etc]

airport_codes = {
    "Amsterdam": "AMS",
}

每个目的地对应于网站上的一个新页面,所以我想我会使用多线程,因为抓取是 I/O 绑定的,下面是我的整个抓取器类,它需要重构和整理,因为到目前为止这只是一个 WIP(抱歉如果对于帖子来说代码块太大)

class FlightScraper:
    logging.basicConfig(format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s', 
                        level=logging.INFO,
                        filename='flight_scraper_logs.txt')
    logging = logging.getLogger(__name__)
    
    def __init__(self) -> None:
        logging.info('Initializing Scraper')
        software_names = [SoftwareName.CHROME.value]
        operating_systems = [OperatingSystem.WINDOWS.value,
                             OperatingSystem.LINUX.value,
                             OperatingSystem.MAC_OS_X.value]
        user_agent_rotator = UserAgent(software_names=software_names,
                                       operating_systems=operating_systems,
                                       limit=100)
        user_agent = user_agent_rotator.get_random_user_agent()
        options = Options()
        options.add_argument('--disable-blink-features')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.headless = True
        options.add_argument(f'user-agent={user_agent}')
        self.driver = webdriver.Chrome(options=options)
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": f"{user_agent}"})
        self.driver.get("https://www.cheapflights.co.uk/flight-search/LHR-AMS/2021-12-02/2021-12-05?sort=bestflight_a")
        self.__bypass_cookies()
        self.city = ""

    def __bypass_cookies(self) -> None:
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, COOKIES_POPUP))
            )
            WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, COOKIES_POPUP))
            )
        except Exception as e:
            logging.error(f"{e}", exc_info=True)
        finally:
            cookie = self.driver.find_element(By.XPATH, COOKIES_POPUP)
            cookie.click()
    
    def change_url(self, depart_date: str, return_date: str, destination) -> None:
        curr_url = self.driver.current_url
        url_sections = curr_url.split('/')
        url_sections[-2] = depart_date
        logging.info(f'Depart date changed to {depart_date}')
        airports = url_sections[-3].split('-')
        airports[1] = destination
        url_sections[-3] = '-'.join(airports)
        logging.info(f'Destination changed to {destination}')
        return_section = url_sections[-1].split('?')
        return_section[0] = return_date
        logging.info(f'Return date changed to {return_date}')
        url_sections[-1] = '?'.join(return_section)
        new_url = '/'.join(url_sections)
        self.driver.get(new_url)
        self.city = destination
        return self.city
        
    def get_flight_info(self, info):
        flight = {}
        sleep(2)
        origin_container = info.find_elements(By.XPATH, FLIGHTS_MAIN)[0].text
        sleep(2)
        return_container = info.find_elements(By.XPATH, FLIGHTS_MAIN)[1].text
        sleep(2)
        airline = info.find_element(By.XPATH, AIRLINE).text
        flight['Origin-Flight'] = origin_container.split('\n')[0]
        flight['Origin-Airport'] = origin_container.split('\n')[1]
        flight['Airline'] = airline
        flight['Origin-Destination-Airport'] = origin_container.split('\n')[3]
        flight['Origin-Flight-Type'] = origin_container.split('\n')[4]
        flight['Origin-Flight-Duration'] = origin_container.split('\n')[5]
        flight['Return-Flight'] = return_container.split('\n')[0]
        flight['Return-Airport'] = return_container.split('\n')[1]
        flight['Return-Destination-Airport'] = return_container.split('\n')[3]
        flight['Return-Flight-Type'] = return_container.split('\n')[4]
        flight['Return-Flight-Duration'] = return_container.split('\n')[5]
        
        return flight
        
    def get_flight_info_driver(self) -> pd.DataFrame:
        try:
            logging.info('Getting information on flights')
            WebDriverWait(self.driver, 20).until(EC.presence_of_element_located((By.XPATH, FLIGHTS_CARD)))
        except Exception as e:
            logging.error(f'{e}', exc_info=True)
        else:
            sleep(5)
            self.driver.refresh()
            flights_info = self.driver.find_elements(By.XPATH, FLIGHTS_CARD)
            list_of_flight_dicts = []
            for info in tqdm(flights_info, desc="Flight info progress:", total=len(flights_info)):
                flight = self.get_flight_info(info)
                flights_df = pd.DataFrame([flight], columns=flight.keys())
                list_of_flight_dicts.append(flights_df)
            flights_df = pd.concat(list_of_flight_dicts)
            flights_df.to_csv(f'{os.getcwd()}/flights_information/{self.city}.csv', index=False)
            
            
    def scrape(self, depart_date, return_date, destination):
        self.change_url(depart_date, return_date, destination)
        self.get_flight_info_driver()

    
    def run(self):
        with ThreadPoolExecutor() as executor:
            scrapers = [
                executor.submit(
                    self.scrape, '2022-01-10', '2022-01-14', AIRPORT_CODES[city]) for city in tqdm(DESTINATIONS, desc='Scraper Progress', total=len(DESTINATIONS))
                                                                                                   ]
        for scraper in scrapers:
            scraper.result()
            
scraper = FlightScraper()
scraper.run()

为此,我立即得到陈旧的元素异常,我认为这是因为我只在该类的一个实例中运行它,因此由于它更改了同一个 chrome 实例的 url/页面,因此前一个 url 的 web 元素正在等待其他线程获取请求立即变得陈旧。

我想知道是否有办法让我的想法发挥作用?我最初的想法是在每个线程中以某种方式运行该类的唯一实例,并为每个实例运行具有唯一城市的刮擦方法,但我不知道如何将每个方法分配给刮板的唯一实例而不会重叠。

如果这是措辞不当或愚蠢的问题,非常抱歉。

谢谢

更新

我设法想出了一个解决方案,我在类之外有一个函数,它初始化类,然后是它的抓取方法,然后有一个单独的函数,它创建调用该函数的线程,每个线程都有一个唯一的城市,见下文:

def _run_scrape(city):
    scraper = FlightScraper(city)
    scraper.scrape('2022-01-10', '2022-01-14')
    

def run():
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(
                _run_scrape, AIRPORT_CODES[city]
            ) for city in tqdm(DESTINATIONS, desc="Scraper Progress", total=len(tqdm(DESTINATIONS)))]
        
    for scraper in futures:
        scraper.result()
run()

【问题讨论】:

  • 干得好找到解决方案,而不是更新你的问题,你应该自己回答。
  • 抱歉完全忘记了,谢谢提醒
  • 别着急,你也可以接受的:)

标签: python multithreading selenium web-scraping


【解决方案1】:

我在类之外创建了一个函数,它初始化类,然后是它的抓取方法,然后有一个单独的函数创建调用该函数的线程,每个线程都有一个唯一的城市

def _run_scrape(city):
    scraper = FlightScraper(city)
    scraper.scrape('2022-01-10', '2022-01-14')
    

def run():
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(
                _run_scrape, AIRPORT_CODES[city]
            ) for city in DESTINATIONS]
        
    for scraper in futures:
        scraper.result()
run()

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2011-08-15
    • 2016-07-18
    • 1970-01-01
    相关资源
    最近更新 更多