# selenium_51job_com.py
#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:51job_com.py
# Author:LGSP_Harold
"""Scrape 51job.com search results with headless Firefox and store them in MongoDB."""
import time

import pymongo
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class HandleWebdriver:
    """Drive a headless Firefox session through 51job.com search result pages.

    Each result page is parsed with lxml and the extracted job records are
    inserted into the ``collections_51job`` collection of the
    ``db_51job_com`` MongoDB database.
    """

    # Landing page that hosts the search box. The original literal contained
    # ``°reefrom`` (the ``&deg`` prefix of ``&degreefrom`` had been turned into
    # a degree sign by an HTML-entity decoder); the query parameter is restored.
    SEARCH_URL = (
        'https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,1.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    MONGO_URI = 'mongodb://admin:admin@127.0.0.1:27017'
    NEXT_PAGE_XPATH = '//li[@class="next"]/a'

    def __init__(self):
        """Start a headless Firefox instance and keep it on ``self.browser``."""
        options = Options()
        options.add_argument('--headless')
        # ``options=`` is the supported keyword; ``firefox_options=`` is the
        # deprecated spelling of the same argument.
        self.browser = webdriver.Firefox(options=options)
        # self.browser.maximize_window()

    def handle_job(self, keyword=None):
        """Search for a job keyword and scrape every result page.

        Args:
            keyword: Text to type into the search box. When ``None`` (the
                default) the keyword is read interactively from stdin once
                the search box is present.

        Raises:
            selenium.common.exceptions.TimeoutException: If the search box or
                the result list does not appear within 5 seconds.
        """
        try:
            self.browser.get(self.SEARCH_URL)
            wait = WebDriverWait(self.browser, 5, 0.5)

            # ``until`` returns the element or raises TimeoutException, so no
            # truthiness test on its result is needed.
            wait.until(EC.presence_of_element_located((By.ID, 'keywordInput')))
            if keyword is None:
                keyword = input('请输入要查找的岗位:')

            # Look the box up again after the (possibly long) prompt instead
            # of reusing the element reference obtained before it.
            self.browser.find_element(By.ID, 'keywordInput').send_keys(keyword)
            self.browser.find_element(By.ID, 'search_btn').click()
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'j_joblist')))

            while True:
                # Fixed pause to let the result list render before it is read.
                time.sleep(2)
                self.handle_parse(self.browser.page_source)
                try:
                    self.browser.find_element(By.XPATH, self.NEXT_PAGE_XPATH).click()
                except WebDriverException:
                    # No usable "next" link: pagination is finished. Only
                    # WebDriver errors end the loop; anything else propagates.
                    break
        finally:
            # Always release the browser process, including on timeouts and
            # parsing or database errors.
            self.browser.quit()

    @staticmethod
    def _first_text(node, path, default=''):
        """Return the first result of ``node.xpath(path)`` or ``default`` if none."""
        found = node.xpath(path)
        return found[0] if found else default

    def handle_parse(self, page_source):
        """Extract job records from one result page and store them.

        Args:
            page_source: HTML text of a search result page.

        Each record has the keys ``job_name``, ``time``, ``money``,
        ``address`` and ``tags``. A missing salary becomes ``'面议'``, a missing
        tag list becomes ``'暂无'``, and any other missing field becomes an
        empty string so that one malformed card does not abort the crawl.
        """
        html_obj = etree.HTML(page_source)
        items = html_obj.xpath('//div[@class="j_joblist"]/div[@class="e"]')

        data_list = []
        for item in items:
            tags = item.xpath('.//a/p[@class="tags"]/span/i/text()')
            if tags:
                # Keep the trailing ' | ' so new records match the format of
                # records written by earlier runs.
                tag_text = ''.join(tag + ' | ' for tag in tags)
            else:
                tag_text = '暂无'

            data_list.append({
                'job_name': self._first_text(
                    item, './/a/p[@class="t"]/span[@class="jname at"]/text()'),
                'time': self._first_text(
                    item, './/a/p[@class="t"]/span[@class="time"]/text()'),
                'money': self._first_text(
                    item, './/a/p[@class="info"]/span[@class="sal"]/text()', '面议'),
                'address': self._first_text(
                    item, './/a/p[@class="info"]/span[@class="d at"]/text()'),
                'tags': tag_text,
            })

        # print(data_list)
        self.handle_mongodb(data_list)

    def handle_mongodb(self, data_list):
        """Insert the given records into MongoDB.

        Args:
            data_list: List of record dicts. An empty list is ignored because
                ``insert_many`` rejects it.
        """
        if not data_list:
            return

        client = pymongo.MongoClient(self.MONGO_URI)
        try:
            client['db_51job_com']['collections_51job'].insert_many(data_list)
        finally:
            # A client is opened per page; close it so connections do not
            # accumulate over a long crawl.
            client.close()


def main():
    """Run the scraper interactively."""
    HandleWebdriver().handle_job()


if __name__ == '__main__':
    main()