使用 Selenium 爬取 51Job 职位信息并存入 MongoDB

selenium_51job_com.py
#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:51job_com.py
# Author:LGSP_Harold
import time

import pymongo
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class HandleWebdriver:
    """Scrape 51job search results with headless Firefox and store them in MongoDB.

    Typical use: ``HandleWebdriver().handle_job()``. The browser is started in
    ``__init__`` and always shut down at the end of ``handle_job``.
    """

    # Search result page that is opened before the keyword is typed in.
    SEARCH_URL = (
        'https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,1.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or environment variables.
    MONGO_URI = 'mongodb://admin:admin@127.0.0.1:27017'
    MONGO_DB = 'db_51job_com'
    MONGO_COLLECTION = 'collections_51job'

    WAIT_TIMEOUT = 5    # seconds an explicit wait may block
    WAIT_POLL = 0.5     # seconds between polls of an explicit wait
    PAGE_DELAY = 2      # seconds to let a result page render before parsing

    def __init__(self):
        """Start a headless Firefox session and keep it in ``self.browser``."""
        options = Options()
        options.add_argument('--headless')

        # ``options=`` is accepted by Selenium 3.8+ and 4.x, whereas the older
        # ``firefox_options=`` keyword is deprecated/removed.
        self.browser = webdriver.Firefox(options=options)

    def handle_job(self, keyword=None):
        """Search 51job for a position and store every result page.

        Args:
            keyword: Position to search for. When ``None`` (the default) the
                user is prompted on stdin once the search box is present.

        Raises:
            selenium.common.exceptions.TimeoutException: if the search box or
                the result list does not appear within ``WAIT_TIMEOUT``.
            IndexError: propagated from ``handle_parse`` when a job card lacks
                a required field.
        """
        try:
            self.browser.get(self.SEARCH_URL)
            wait = WebDriverWait(self.browser, self.WAIT_TIMEOUT, self.WAIT_POLL)

            # ``until`` returns the located element or raises TimeoutException,
            # so no truthiness check on its result is needed.
            search_box = wait.until(
                EC.presence_of_element_located((By.ID, 'keywordInput')))
            if keyword is None:
                keyword = input('请输入要查找的岗位：')
            search_box.send_keys(keyword)
            self.browser.find_element(By.ID, 'search_btn').click()

            wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'j_joblist')))
            while True:
                # Give the (re)loaded result list time to render.
                time.sleep(self.PAGE_DELAY)
                self.handle_parse(self.browser.page_source)
                try:
                    self.browser.find_element(
                        By.XPATH, '//li[@class="next"]/a').click()
                except WebDriverException:
                    # No usable "next" link: treat this as the last page.
                    # Non-Selenium errors (e.g. KeyboardInterrupt) propagate.
                    break
        finally:
            # Always release the browser process, even after a failure.
            self.browser.quit()

    @staticmethod
    def _format_tags(tags):
        """Return tags as ``'a | b | '`` or ``'暂无'`` when there are none."""
        if not tags:
            return '暂无'
        return ''.join(tag + ' | ' for tag in tags)

    def handle_parse(self, page_source):
        """Extract the job cards from one result page and store them.

        Args:
            page_source: HTML text of a search result page.

        Raises:
            IndexError: if a card has no job name, date or address. These
                fields stay strict so that markup changes fail loudly instead
                of storing empty records.
        """
        html_obj = etree.HTML(page_source)
        items = html_obj.xpath('//div[@class="j_joblist"]/div[@class="e"]')
        data_list = []
        for item in items:
            salary = item.xpath('.//a/p[@class="info"]/span[@class="sal"]/text()')
            tags = item.xpath('.//a/p[@class="tags"]/span/i/text()')
            data_list.append({
                'job_name': item.xpath(
                    './/a/p[@class="t"]/span[@class="jname at"]/text()')[0],
                'time': item.xpath(
                    './/a/p[@class="t"]/span[@class="time"]/text()')[0],
                # Cards without a salary are stored as "negotiable".
                'money': salary[0] if salary else '面议',
                'address': item.xpath(
                    './/a/p[@class="info"]/span[@class="d at"]/text()')[0],
                # xpath() returns an empty list rather than raising, so the
                # "no tags" fallback has to be an explicit emptiness check.
                'tags': self._format_tags(tags),
            })
        self.handle_mongodb(data_list)

    def handle_mongodb(self, data_list):
        """Insert the parsed records into the MongoDB collection.

        Args:
            data_list: List of job dicts as built by ``handle_parse``. An
                empty list is a no-op, because ``insert_many`` rejects it.
        """
        if not data_list:
            return
        # The context manager closes the client so connections are not leaked
        # once per result page.
        with pymongo.MongoClient(self.MONGO_URI) as client:
            collection = client[self.MONGO_DB][self.MONGO_COLLECTION]
            collection.insert_many(data_list)


if __name__ == '__main__':
    # Run the crawl only when executed as a script, so that importing this
    # module does not launch a browser or block on stdin. The instance is not
    # named ``selenium`` to avoid confusion with the selenium package.
    crawler = HandleWebdriver()
    crawler.handle_job()