【发布时间】:2019-10-25 14:30:43
【问题描述】:
我写了一段代码来抓取 Santander(桑坦德银行)网站上的贷款模拟器。
抓取过程看起来能够正常运行,但得到的结果是错误的;而且当我连续两次运行同一段代码时,两次的结果并不相同。
如何让抓取更加健壮?令人困惑的是,当我逐步运行代码并逐一检查结果时,一切似乎都运行正常。
# Column order of the CSV written by hw_santander_scrape.
_SANTANDER_COLUMNS = [
    'Project',
    'Period',
    'Monthly repayment',
    'TIN',
    'TAE',
    'Total repayment',
    'Initial amount',
    'Duration',
]


def _wait_for_ajax(driver, timeout=30):
    """Block until jQuery reports no in-flight AJAX request on the page."""
    from selenium.webdriver.support.ui import WebDriverWait

    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script('return jQuery.active') == 0
    )


def _euro_to_float(text):
    """Convert a Spanish-formatted amount such as '1.083,33 €' to a float."""
    return float(text.replace(' €', '').replace('.', '').replace(',', '.'))


def hw_santander_scrape(Amount, Duration):
    """Scrape the Santander loan simulator and write the results to a CSV.

    For every amount/duration pair inside the accepted ranges, the values are
    typed into the simulator form and the displayed monthly repayment, TIN,
    TAE and total repayment are recorded. The rows are written to
    ``Santander_<day>_<month>_<year>.csv`` in the current directory, and a
    screenshot ``screenshot_santander.png`` is overwritten after each
    simulation.

    Args:
        Amount: sequence of loan amounts. Each value is typed into the form
            formatted with three decimals (``13.000``) and recorded with the
            dot stripped (``13000``), so values are presumably meant as
            thousands of euros. Values outside [3.000, 90.000] are skipped.
        Duration: sequence of loan durations; each is truncated with
            ``int()``. Values outside [12, 96] are skipped.

    Returns:
        None. Results are only written to disk.

    Note:
        ``pd`` and ``DT`` are expected to be provided by the enclosing module.
    """
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument('window-size=10000x5000')

    maintenant = DT.now()
    # Underscore form is used in the file name, slash form in the data rows.
    period = '{}_{}_{}'.format(maintenant.day, maintenant.month, maintenant.year)
    row_period = '{}/{}/{}'.format(maintenant.day, maintenant.month, maintenant.year)
    print('Start Scraping')

    url = 'https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0#/t0'
    Max_amount = 90.000
    Min_amount = 3.000
    Max_duration = 96
    Min_duration = 12

    rows = []
    # Keep the browser under a distinct name so the selenium module is not
    # shadowed, and always quit it so no headless Chrome process is leaked.
    driver = webdriver.Chrome('chromedriver', options=chrome_options)
    try:
        driver.get(url)
        for Simulated_amount in Amount:
            if Simulated_amount > Max_amount or Simulated_amount < Min_amount:
                continue

            amount = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#amount"))
            )
            amount.clear()
            amount.send_keys("{:.3f}".format(Simulated_amount))
            _wait_for_ajax(driver)

            for raw_duration in Duration:
                Simulated_duration = int(raw_duration)
                if Simulated_duration > Max_duration or Simulated_duration < Min_duration:
                    continue

                term = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#term"))
                )
                term.clear()
                term.send_keys("{}".format(Simulated_duration))
                term.send_keys(Keys.TAB)
                # Leaving the field triggers the recalculation; wait for it to
                # finish before reading, otherwise the previous simulation's
                # figures may still be displayed.
                _wait_for_ajax(driver)
                driver.save_screenshot('screenshot_santander.png')

                # Read the rates element once so TIN and TAE come from the
                # same rendering of the page.
                # NOTE(review): the fixed slices assume both rates are shown
                # as four characters (e.g. '6,95') at these offsets - confirm
                # against the live page.
                rates_text = driver.find_element(By.CSS_SELECTOR, '.r3 span').text
                rows.append({
                    'Project': "reforma vivienda",
                    'Period': row_period,
                    'Monthly repayment': _euro_to_float(
                        driver.find_element(By.CSS_SELECTOR, '.r1 span').text
                    ),
                    'TIN': float(rates_text[6:10].replace(',', '.')),
                    'TAE': float(rates_text[13:17].replace(',', '.')),
                    'Total repayment': _euro_to_float(
                        driver.find_element(By.CSS_SELECTOR, '.r7 span').text
                    ),
                    'Initial amount': float(
                        "{:.3f}".format(Simulated_amount).replace('.', '')
                    ),
                    'Duration': Simulated_duration,
                })
    finally:
        driver.quit()

    Santander = pd.DataFrame(rows, columns=_SANTANDER_COLUMNS)
    # Rows with a zero TIN were never part of the output; keep that filter.
    Santander = Santander.loc[Santander['TIN'] != 0, :]
    Santander.to_csv('Santander_{}.csv'.format(period), index=False)
    print('End Scraping')
运行代码:
# Loan amounts to simulate; the scraper types each one with three decimals,
# so 13.000 is presumably meant as 13,000 euros.
Amount = [
    13.000,
    14.000,
    15.000,
    30.000,
    45.000,
    60.000,
]
# Loan durations to simulate for every amount.
Duration = [
    12, 15, 24,
    36, 48, 60,
    72, 84, 96,
]
hw_santander_scrape(Amount=Amount, Duration=Duration)
【问题讨论】:
标签: python selenium-webdriver web-scraping beautifulsoup selenium-chromedriver