【发布时间】:2020-10-27 07:07:07
【问题描述】:
我有一段用 selenium 写的代码。它工作正常:它抓取门户网站并提取表格中的数据。但现在我正试图转向 scrapy 或 requests。我尝试学习两者,但都失败了。selenium 的结构符合我的思路。我需要很长时间才能理解 requests 或 scrapy 的基础知识,然后才能使用它们。捷径是获得一些直接针对当前代码、说明如何改写的提示。
我为什么要更换工具? - 我发布了代码以寻求重构建议 (here)。其中两条评论建议我转向 requests,这促使我开始尝试。经过一些初步搜索,我意识到可以不使用 selenium,而 requests 或 scrapy 可以为我节省大量时间。
我检查了here。但这并不能解决我的问题。
有人可以帮忙吗?提前致谢。
代码(包括 URL)-
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, \
TimeoutException, StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from FIR_logging import logger
import os
import time
import pandas as pd
# base function
def get_url(some_url):
    """Open *some_url* in the shared ``driver``, retrying until it loads.

    Every ``WebDriverException`` raised by ``driver.get`` is followed by a
    60-second pause and another attempt; the number of attempts is not
    bounded. Once the page has loaded, it is refreshed one time.
    """
    loaded = False
    while not loaded:
        try:
            driver.get(some_url)
        except WebDriverException:
            # Back off before the next attempt.
            time.sleep(60)
        else:
            loaded = True
    driver.refresh()
# Module-level configuration and browser start-up (executed when the file runs).
# Address of the portal page that the script loads and scrapes.
URL = r'https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx'
# Firefox is launched headless and in a private window.
options = FirefoxOptions()
options.add_argument("--headless")
options.add_argument("--private-window")
# Shared browser session; the helper functions in this file use this global.
driver = webdriver.Firefox(options=options)
get_url(URL)
# Fixed pause after the first load (presumably to let the page settle).
time.sleep(10)
# Root folder under which one output folder per district is created.
Download_Directory = r'/some_directory/raw_footage7'
# Column labels applied to the scraped rows when the DataFrame is built.
# NOTE(review): 'FIR No.' and 'FIR No' are two distinct labels here -
# confirm that both columns really exist in the result table.
COLUMNS = ['Sr.No.', 'State', 'District', 'Police Station', 'Year', 'FIR No.', 'Registration Date', 'FIR No',
'Sections']
# District names iterated by the main loop; each one is matched against the
# visible text of the district dropdown options.
ALL_Districts = ['AKOLA', 'AMRAVATI CITY', 'AMRAVATI RURAL', 'AURANGABAD CITY',
'AURANGABAD RURAL', 'BEED', 'BHANDARA', 'BRIHAN MUMBAI CITY', 'BULDHANA',
'CHANDRAPUR', 'DHULE', 'GADCHIROLI', 'GONDIA', 'HINGOLI', 'JALGAON', 'JALNA',
'KOLHAPUR', 'LATUR', 'NAGPUR CITY', 'NAGPUR RURAL', 'NANDED', 'NANDURBAR',
'NASHIK CITY', 'NASHIK RURAL', 'NAVI MUMBAI', 'OSMANABAD', 'PALGHAR', 'PARBHANI',
'PIMPRI-CHINCHWAD', 'PUNE CITY', 'PUNE RURAL', 'RAIGAD', 'RAILWAY AURANGABAD',
'RAILWAY MUMBAI', 'RAILWAY NAGPUR', 'RAILWAY PUNE', 'RATNAGIRI', 'SANGLI', 'SATARA',
'SINDHUDURG', 'SOLAPUR CITY', 'SOLAPUR RURAL', 'THANE CITY', 'THANE RURAL', 'WARDHA',
'WASHIM', 'YAVATMAL']
# other functions
def district_selection(name):
    """Select district *name* in the district dropdown of the search form.

    Args:
        name: Visible text of the dropdown option to select.

    Returns:
        True when the option was selected; False when *name* is not among
        the dropdown options (an info log entry is written in that case).
    """
    dist_list = Select(driver.find_element_by_css_selector(
        "#ContentPlaceHolder1_ddlDistrict"))
    names = {option.get_attribute("text") for option in dist_list.options}
    # Drop the placeholder entry by exact match. The former
    # `not in ('Select')` was a substring test against the string 'Select'
    # (parentheses alone do not make a tuple), so any option whose text was
    # a substring of 'Select' was wrongly excluded as well.
    names.discard('Select')
    if name not in names:
        logger.info(f"{name} is not in list")
        return False
    dist_list.select_by_visible_text(name)
    # Fixed pause after the selection (presumably the page reloads - confirm).
    time.sleep(8)
    # Explicit success value so callers can tell success from failure.
    return True
def enter_date(date):
    """Type *date* into both the "from" and the "to" registration-date fields.

    Waits up to 160 seconds for the "from" field to be present, then uses a
    single action chain to click each field in turn and send *date* to it.
    The entered value is logged afterwards.
    """
    from_selector = '#ContentPlaceHolder1_txtDateOfRegistrationFrom'
    to_selector = '#ContentPlaceHolder1_txtDateOfRegistrationTo'

    # Block until the "from" field exists in the DOM.
    wait = WebDriverWait(driver, 160)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, from_selector)))

    from_date_field = driver.find_element_by_css_selector(from_selector)
    to_date_field = driver.find_element_by_css_selector(to_selector)

    # Queue the interactions step by step, then run them in one go.
    typing = ActionChains(driver)
    typing.click(from_date_field)
    typing.send_keys(date)
    typing.move_to_element(to_date_field)
    typing.click()
    typing.send_keys(date)
    typing.perform()

    logger.info(f'date entered: {date}')
def search():
    """Click the form's search button (``#ContentPlaceHolder1_btnSearch``)."""
    search_button = driver.find_element_by_css_selector('#ContentPlaceHolder1_btnSearch')
    search_button.click()
def number_of_records():
    """Read the total-records label of the result page and return its value.

    The label is polled up to 18 times with a one-second pause between
    attempts. Both an empty label and a missing/stale element count as an
    attempt, so the polling always terminates.

    Reads the module-level ``district`` name for its log messages.

    Returns:
        The number of records as a non-zero int, or False when the label
        shows 0 or never becomes readable within the polling window.

    Raises:
        ValueError: If the label text is non-empty but not an integer.
    """
    max_attempts = 18
    for _ in range(max_attempts):
        try:
            records_text = driver.find_element_by_css_selector(
                '#ContentPlaceHolder1_lbltotalrecord').text
        except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
            logger.info("page is not loaded")
            # Pause before retrying instead of hammering the browser.
            time.sleep(1)
            continue
        if records_text == '':
            # Label not filled in yet; this attempt is consumed (previously
            # it was not counted, which could loop forever).
            time.sleep(1)
            continue
        records_number = int(records_text)
        if records_number != 0:
            logger.info(f"{district}: {records_number}")
            return records_number
        logger.info(f"no records @ {district}")
        return False
    # Polling window exhausted without a readable label.
    return False
def extract_table_current(name, single):
    """Append the rows of the currently displayed result table to *single*.

    Args:
        name: District name, used only in log messages.
        single: List that receives one list of cell texts per data row
            (at most the first nine cells of each row).

    Returns:
        None. When the table cannot be found after all attempts, nothing is
        appended and a log entry is written.
    """
    max_attempts = 16
    main_table = None
    for attempt in range(1, max_attempts + 1):
        # Re-read and re-parse the live page on every attempt; parsing a
        # single snapshot once (as before) could never observe a late table.
        soup = BS(driver.page_source, 'html.parser')
        main_table = soup.find("table", {"id": "ContentPlaceHolder1_gdvDeadBody"})
        if main_table is not None:
            break
        if attempt < max_attempts:
            logger.info(f"the table did not load @ {name}")
            time.sleep(1)
    if main_table is None:
        logger.info(f"the table did not load @ {name}."
                    f"stopped trying")
        return

    rows = main_table.find_all("tr")
    # find_elements_* returns a list (empty when nothing matches), never
    # None, so the presence of pager links is tested by truthiness.
    has_pager = bool(driver.find_elements_by_css_selector('.gridPager a'))
    if has_pager:
        # The last two <tr> elements are not data rows when a pager is shown
        # (presumably the pager row and its nested table row - confirm).
        rows = rows[:-2]

    # No per-row sleep: the rows come from already-parsed HTML, so waiting
    # between them cannot change what is read.
    for row in rows:
        if not has_pager and '...' in row.text:
            continue
        cells = row.find_all('td')[0:9]  # drop the last column
        if cells:
            # Rows without <td> cells (e.g. a header row) carry no data.
            single.append([cell.text for cell in cells])
def next_page(name, data):
    """Visit every pager link of the current result and extract each page.

    Args:
        name: District name, passed through to ``extract_table_current``.
        data: List that collects the extracted rows.

    Returns:
        False when the page has no pager link at all; otherwise None after
        all pager links have been processed.
    """
    pager_selector = '.gridPager a'
    try:
        driver.find_element_by_css_selector(pager_selector)
    except NoSuchElementException:
        return False

    link_count = len(driver.find_elements_by_css_selector(pager_selector))
    for index in range(link_count):
        # Look the links up again on each pass to avoid stale references.
        current_links = driver.find_elements_by_css_selector(pager_selector)
        link = current_links[index]
        # The '...' link opens the next slot of pages and is not followed here.
        if link.text == '...':
            continue
        link.click()
        # Fixed pause to let the newly selected page load.
        time.sleep(8)
        extract_table_current(name, data)
def second_page_slot():
    """Click the pager link whose text is ``...`` to open the next page slot.

    Returns:
        False when no link with the text ``...`` exists; otherwise None.
    """
    try:
        driver.find_element_by_link_text('...').click()
    except NoSuchElementException:
        return False
    return None
# main code
# For every district and every day of the chosen month: open a fresh
# browser, search the published FIRs for that single day, collect all result
# pages and write them to one CSV file per district and day.
import calendar

page_data = []
time.sleep(5)
view = Select(driver.find_element_by_css_selector(
    '#ContentPlaceHolder1_ucRecordView_ddlPageSize'))
view.select_by_value('50')
# NOTE(review): the start-up browser is closed right here and every loop
# iteration below opens a new private session, so the page size chosen above
# presumably does not carry over - confirm whether it should be set per session.
driver.close()
for district in ALL_Districts:
    b = "06"
    c = "2020"
    district_directory = os.path.join(Download_Directory, f'{district}{b}{c}')
    # Also creates missing parent folders and tolerates an existing folder.
    os.makedirs(district_directory, exist_ok=True)
    # Cover every day of the month, including the last one (the former
    # range(1, 30) stopped at the 29th).
    days_in_month = calendar.monthrange(int(c), int(b))[1]
    for i in range(1, days_in_month + 1):
        # Start with an empty list for each day so that a day's CSV holds
        # only that day's rows instead of everything scraped so far.
        page_data = []
        # Reopen the page in a fresh browser to wipe out the cache.
        options = FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument("--private-window")
        driver = webdriver.Firefox(options=options)
        try:
            get_url(URL)
            # Zero-pad the day so that 1..9 are entered as 01..09.
            day = f'{i:02d}'
            date_from = day + b + c
            enter_date(date_from)
            # select district; skip the search when it is not in the dropdown
            if district_selection(district) is False:
                continue
            time.sleep(3)
            # start the search
            search()
            time.sleep(7)
            if not number_of_records():
                continue
            extract_table_current(district, page_data)
            time.sleep(3)
            # Collects the remaining pages, if any; the first page has
            # already been extracted above.
            next_page(district, page_data)
            district_data = pd.DataFrame(page_data, columns=COLUMNS)
            district_data.to_csv(os.path.join(district_directory, f'{district}{day}{b}{c}.csv'))
        finally:
            # Always end the browser session, also on the early `continue`
            # paths, so that no Firefox process is left behind per day.
            driver.quit()
【问题讨论】:
标签: python python-3.x selenium web-scraping scrapy