爬虫目的:爬取拉勾网上数据分析岗的数据,以便分析当前数据分析岗的需求现状。
爬虫实现的功能:根据城市名称、岗位名称关键字,即可爬取拉勾网上所需的数据信息。
爬虫的主要模块:
主函数与信息存储模块main.py
网页下载模块https.py
网页解析模块parse.py
IP代理池setting.py
# main.py
"""Entry point of the Lagou crawler: fetch job listings per city/keyword
and persist them to a CSV file.

Lagou bans IPs that issue many requests, so all traffic goes through the
proxy/UA machinery in https.py; failures are logged to diary.log so the
crawler keeps running across cities and keywords.
"""
import codecs
import logging
import time

import https
import parse
import setting

logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename='diary.log',
                    filemode='a')


def process(value):
    """Escape one field so it is safe inside a comma-separated line.

    RFC 4180 quoting: embedded double quotes are doubled (the original
    replaced them with single quotes, silently corrupting the data), and
    the field is wrapped in double quotes when it contains a quote, an
    ASCII or full-width comma, or a newline.
    """
    if '"' in value:
        value = '"' + value.replace('"', '""') + '"'
    elif ',' in value or '，' in value or '\n' in value:
        value = '"' + value + '"'
    return value


def getInfo(url, para):
    """Fetch every result page for *para* and return the parsed records."""
    generalHttp = https.Http()
    htmlCode = generalHttp.post(url, para=para, headers=setting.headers,
                                cookies=setting.cookies)
    generalParse = parse.Parse(htmlCode)
    pageCount = generalParse.parsePage()
    info = []
    for i in range(1, pageCount + 1):
        print('第%s页' % i)
        para['pn'] = str(i)          # page number parameter of the AJAX endpoint
        htmlCode = generalHttp.post(url, para=para, headers=setting.headers,
                                    cookies=setting.cookies)
        generalParse = parse.Parse(htmlCode)
        info.extend(getInfoDetail(generalParse))
        time.sleep(2)                # throttle requests to lower the ban risk
    return info


def getInfoDetail(generalParse):
    """Extract the job records from one parsed page."""
    return generalParse.parseInfo()


# Record keys written per CSV row, in header-column order.
_CSV_FIELDS = ['companyName', 'positionType', 'positionName', 'companyStage',
               'companySize', 'positionEducation', 'positionWorkYear',
               'positionSalary', 'district', 'latitude', 'longitude',
               'companyType', 'positionLables', 'positionAdvantage',
               'companyLabel']


def processInfo(info, para):
    """Write the scraped records to '<city><keyword>职位.csv'.

    Returns True on success, None on failure (the error is logged and
    printed).  The file is written in gbk because Excel on zh-CN Windows
    misreads plain utf-8 CSV as garbled text.
    """
    logging.error('Process start')
    title = 'companyName,positionType,positionName,companyStage,companySize,' \
            'Education,WorkYear,Salary,district,latitude,longitude,' \
            'companyType,positionLables,positionAdvantage,companyLabel\n'
    try:
        # 'with' guarantees the handle is closed even if a write fails
        # (the original leaked it on exception).
        with codecs.open('%s%s职位.csv' % (para['city'], para['kd']), 'w', 'gbk') as file:
            file.write(title)
            for p in info:
                # Every field goes through process() so embedded commas or
                # quotes cannot break the column layout.
                file.write(','.join(process(str(p[k])) for k in _CSV_FIELDS) + '\n')
        return True
    except Exception as e:
        logging.error('ProcessExcept %s' % str(e))
        print(e)
        return None


def main(url, para):
    """Crawl one city/keyword combination; return True on success, else None."""
    logging.error('Main start')   # breadcrumb in diary.log
    if url:
        info = getInfo(url, para)       # fetch + parse
        flag = processInfo(info, para)  # persist
        return flag
    else:
        return None


if __name__ == '__main__':
    kdList = [u'数据分析']    # search keywords
    cityList = [u'上海']      # target cities
    # Lagou's search AJAX endpoint (found via browser devtools network tab).
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    for keyword in kdList:
        for city in cityList:
            print('爬取%s' % city)
            para = {'first': 'true', 'pn': '1', 'kd': keyword, 'city': city}
            flag = main(url, para)
            if flag:
                print('%s爬取成功' % city)
            else:
                print('%s爬取失败' % city)
# https.py
"""HTTP helpers: GET/POST with retry, random User-Agent and optional proxy."""
import logging
import random
import urllib.parse

import requests

import setting

logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename='diary.log',
                    filemode='a')


class Http:
    '''
    HTTP request helpers.

    Note the historical interface asymmetry, preserved for callers:
    get() signals failure with the string 'None', post() with None.
    '''

    def get(self, url, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Fetch a page with GET.

        url: target link
        headers: request headers (random UA from setting.UA when omitted)
        cookies: cookies
        proxy: proxies mapping passed to requests
        timeOut: per-request timeout in seconds
        timeOutRetry: remaining retries after an exception
        return: page source, or the string 'None' on failure
        '''
        if not url:
            logging.error('GetError url not exit')
            return 'None'
        logging.error('Get %s' % url)
        try:
            if not headers:
                headers = {'User-Agent': random.choice(setting.UA)}
            response = requests.get(url, headers=headers, cookies=cookies,
                                    proxies=proxy, timeout=timeOut)
            if response.status_code in (200, 302):
                htmlCode = response.text
            else:
                htmlCode = 'None'
                logging.error('Get %s %s' % (str(response.status_code), url))
        except Exception as e:
            logging.error('GetExcept %s' % str(e))
            if timeOutRetry > 0:
                # Pass headers/cookies/proxy through on retry — the original
                # dropped them, so every retry went out without proxy/identity.
                htmlCode = self.get(url, headers=headers, cookies=cookies,
                                    proxy=proxy, timeOut=timeOut,
                                    timeOutRetry=timeOutRetry - 1)
            else:
                logging.error('GetTimeOut %s' % url)
                htmlCode = 'None'
        return htmlCode

    def post(self, url, para, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Fetch a response with POST.

        url: target link
        para: form data
        headers: request headers (fixed fallback UA when omitted)
        cookies: cookies
        proxy: proxies mapping passed to requests
        timeOut: per-request timeout in seconds
        timeOutRetry: remaining retries after an exception
        return: response text, or None on failure
        '''
        if not url or not para:
            logging.error('PostError url or para not exit')
            return None
        logging.error('Post %s' % url)
        try:
            if not headers:
                headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3'}
            response = requests.post(url, data=para, headers=headers,
                                     cookies=cookies, proxies=proxy, timeout=timeOut)
            if response.status_code in (200, 302):
                htmlCode = response.text
            else:
                htmlCode = None
                logging.error('Post %s %s' % (str(response.status_code), url))
        except Exception as e:
            logging.error('PostExcept %s' % str(e))
            if timeOutRetry > 0:
                # Same fix as get(): keep the full request context on retry.
                htmlCode = self.post(url, para=para, headers=headers,
                                     cookies=cookies, proxy=proxy,
                                     timeOut=timeOut,
                                     timeOutRetry=timeOutRetry - 1)
            else:
                logging.error('PostTimeOut %s' % url)
                htmlCode = None
        return htmlCode

    def confirm(self, htmlCode, url, headers, cookies, proxy, catch_retry=5):
        '''
        Anti-crawler verification-page hook.

        Intended to inspect the page title to detect a ban; currently a
        pass-through stub that returns htmlCode unchanged.
        '''
        return htmlCode

    def urlprocess(self, items):
        '''
        Percent-encode *items* for safe inclusion in a URL.

        urllib.parse.quote(..., safe='') escapes every reserved character
        (space %20, / %2F, ? %3F, # %23, & %26, = %3D, + %2B) and, unlike
        the original chained .replace() calls, escapes '%' itself (%25)
        in a single pass, so input that already contains '%' is no longer
        corrupted.
        '''
        return urllib.parse.quote(items, safe='')
# parse.py
"""Parse a Lagou positionAjax JSON response into plain dict records."""
import json
import re


class Parse:
    '''
    Wraps one JSON response body and exposes page-count and record parsing.
    '''

    def __init__(self, htmlCode):
        # Raw response text is kept for reference; the endpoint returns
        # standard JSON, so the stdlib parser replaces the unmaintained
        # third-party demjson dependency.
        self.htmlCode = htmlCode
        self.json = json.loads(htmlCode)

    def parseTool(self, content):
        '''
        Strip a fixed set of HTML tags/entities from *content*.
        Non-string input is returned unchanged.
        '''
        if not isinstance(content, str):
            return content
        sublist = ['<p.*?>', '</p.*?>', '<b.*?>', '</b.*?>',
                   '<div.*?>', '</div.*?>', '</br>', '<br />',
                   '<ul>', '</ul>', '<li>', '</li>',
                   '<strong>', '</strong>',
                   '<table.*?>', '<tr.*?>', '</tr>', '<td.*?>', '</td>',
                   '\r', '\n', '&.*?;', '&', '#.*?;', '<em>', '</em>']
        for pattern in sublist:
            content = re.sub(re.compile(pattern, re.S), '', content).strip()
        return content

    def parsePage(self):
        '''
        Return the number of result pages.

        Ceiling division of totalCount by resultSize; the original used
        total // size + 1, which requested one empty extra page whenever
        totalCount was an exact multiple of resultSize.  At least one
        page is always reported so the caller still fetches page 1 when
        there are no results.
        '''
        totalCount = int(self.json['content']['positionResult']['totalCount'])   # total positions
        resultSize = int(self.json['content']['positionResult']['resultSize'])   # positions per page
        return max(1, -(-totalCount // resultSize))

    def parseInfo(self):
        '''
        Map each raw position entry to a flat record dict with the key
        names expected by main.processInfo.
        '''
        info = []
        for position in self.json['content']['positionResult']['result']:
            i = {}
            i['companyName'] = position['companyFullName']
            i['positionType'] = position['firstType']
            i['positionName'] = position['positionName']
            i['companyStage'] = position['financeStage']
            i['companySize'] = position['companySize']
            i['positionEducation'] = position['education']
            i['positionWorkYear'] = position['workYear']
            i['positionSalary'] = position['salary']
            i['district'] = position['district']
            i['latitude'] = position['latitude']
            i['longitude'] = position['longitude']
            i['companyType'] = position['industryField']
            # 'positionLables' is Lagou's own (misspelled) field name.
            i['positionLables'] = position['positionLables']
            i['companyLabel'] = position['companyLabelList']
            i['positionAdvantage'] = position['positionAdvantage']
            info.append(i)
        return info
# setting.py
"""Static request configuration for the Lagou crawler: replayed browser
headers/cookies, proxy-pool settings and a User-Agent rotation list."""

# Headers replayed from a real browser session against the Lagou search
# page; Origin/Referer/X-Requested-With appear required for the AJAX
# endpoint to accept the POST — TODO confirm which are strictly needed.
headers = {
    'Host': 'www.lagou.com',
    'Connection': 'keep-alive',
    'Content-Length': '23',
    'Origin': 'https://www.lagou.com',
    'X-Anit-Forge-Code': '0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'X-Anit-Forge-Token': 'None',
    'Referer': 'https://www.lagou.com/jobs/list_java?city=%E5%B9%BF%E5%B7%9E&cl=false&fromSearch=true&labelWords=&suginput=',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
}

# Session cookies captured from a browser visit; presumably stale by now
# and need refreshing before a real crawl — NOTE(review): verify.
cookies = {
    'user_trace_token': '20171011085044-36afc724-ae1e-11e7-947d-5254005c3644',
    'LGUID': '20171011085044-36afc9e6-ae1e-11e7-947d-5254005c3644',
    '_ga': 'GA1.2.1411877279.1507683044',
    'index_location_city': '%E5%B9%BF%E5%B7%9E',
    'JSESSIONID': 'ABAAABAAADEAAFI2466B2149D4B3E406932CAEA37FDF471',
    '_gid': 'GA1.2.1604143331.1517585155',
    'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1515000882,1515252738,1516984463,1517585156',
    'LGSID': '20180202232556-5ce93c91-082d-11e8-abfa-5254005c3644',
    'PRE_UTM': '',
    'PRE_HOST': '',
    'PRE_SITE': '',
    'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
    'TG-TRACK-CODE': 'index_navigation',
    'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1517585322',
    'LGRID': '20180202232842-c0095589-082d-11e8-abfa-5254005c3644',
    'SEARCH_ID': '0a887843a48a49c7bb6dae915dabdcc1'
}

# IP proxy pool
# Pool mode flag: 0 = paid proxies, 1 = free proxies, 2 = no proxy.
TAGIP = 0
# Proxy address list; empty until a pool is populated.
IP = []
# User-Agent strings; https.Http.get picks one at random per request to
# vary the crawler's fingerprint.
UA = ['Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1))',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)',
      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; 360SE)',
      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; 360SE)',
      'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
      'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
      'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
      'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
      'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
      # NOTE(review): missing closing ')' in the original string; kept as-is.
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)',
      'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ',
      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
      'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
      'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0',
      'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
      'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)']