【发布时间】:2020-02-20 23:40:54
【问题描述】:
我编写了一个脚本来抓取酒店的TripAdvisor 上的图像,并且我能够遍历所有这些图像,我关心的是是否知道我在弹出窗口中完成了滚动。我无法创建一个条件来打破我的循环,然后解析所有图像 url 并无限地停留在循环内。我的 if 条件应该是什么才能退出循环?非常感谢任何帮助!
# import dependencies
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re
import selenium
import io
import pandas as pd
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
from _datetime import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
options.headless=False
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3")
driver.maximize_window()
prefs = {"profile.default_content_setting_values.notifications" : 2}
options.add_experimental_option("prefs", prefs)
#open up website
driver.get(
"https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
image_url = []
end = False
while not(end):
old_image_length = len(image_url)
#wait until element is found and then store all webelements into list
images = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located(
(By.XPATH, '//*[@class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
#iterate through visible images and add their url to list
for index, image in enumerate(images):
image_url.append(images[index].value_of_css_property("background-image"))
new_image_length = len(image_url)
#move to next visible images in the array which is the last one
driver.execute_script("arguments[0].scrollIntoView();", images[-1])
#wait one second
time.sleep(1)
#if the first and last image in the arrays are the same for visibility then get out
if(old_image_length == new_image_length):
end = True
#clean the list to provide clear links
for i in range(len(image_url)):
start = image_url[i].find("url(\"") + len("url(\"")
end = image_url[i].find("\")")
print(image_url[i][start:end])
#print(image_url)
【问题讨论】:
标签: python selenium selenium-webdriver web-scraping selenium-chromedriver