爬虫目的:爬取拉勾网上数据分析岗的数据,以便分析当前数据分析岗的需求现状。
爬虫实现的功能:根据城市名称、岗位名称关键字,即可爬取拉勾网上所需的数据信息。
爬虫的主要模块:
主函数与信息存储模块main.py
网页下载模块https.py
网页解析模块parse.py
IP代理池setting.py
# main.py
"""Entry point of the Lagou crawler: fetch job listings per city/keyword
and persist them to a CSV file.

Lagou bans IPs that issue many requests, so all traffic goes through the
proxy/UA machinery in https.py; failures are logged to diary.log so the
crawler keeps running across cities and keywords.
"""
import codecs
import logging
import time

import https
import parse
import setting

logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename='diary.log',
                    filemode='a')


def process(value):
    """Escape one field so it is safe inside a comma-separated line.

    RFC 4180 quoting: embedded double quotes are doubled (the original
    replaced them with single quotes, silently corrupting the data), and
    the field is wrapped in double quotes when it contains a quote, an
    ASCII or full-width comma, or a newline.
    """
    if '"' in value:
        value = '"' + value.replace('"', '""') + '"'
    elif ',' in value or '，' in value or '\n' in value:
        value = '"' + value + '"'
    return value


def getInfo(url, para):
    """Fetch every result page for *para* and return the parsed records."""
    generalHttp = https.Http()
    htmlCode = generalHttp.post(url, para=para, headers=setting.headers,
                                cookies=setting.cookies)
    generalParse = parse.Parse(htmlCode)
    pageCount = generalParse.parsePage()
    info = []
    for i in range(1, pageCount + 1):
        print('第%s页' % i)
        para['pn'] = str(i)          # page number parameter of the AJAX endpoint
        htmlCode = generalHttp.post(url, para=para, headers=setting.headers,
                                    cookies=setting.cookies)
        generalParse = parse.Parse(htmlCode)
        info.extend(getInfoDetail(generalParse))
        time.sleep(2)                # throttle requests to lower the ban risk
    return info


def getInfoDetail(generalParse):
    """Extract the job records from one parsed page."""
    return generalParse.parseInfo()


# Record keys written per CSV row, in header-column order.
_CSV_FIELDS = ['companyName', 'positionType', 'positionName', 'companyStage',
               'companySize', 'positionEducation', 'positionWorkYear',
               'positionSalary', 'district', 'latitude', 'longitude',
               'companyType', 'positionLables', 'positionAdvantage',
               'companyLabel']


def processInfo(info, para):
    """Write the scraped records to '<city><keyword>职位.csv'.

    Returns True on success, None on failure (the error is logged and
    printed).  The file is written in gbk because Excel on zh-CN Windows
    misreads plain utf-8 CSV as garbled text.
    """
    logging.error('Process start')
    title = 'companyName,positionType,positionName,companyStage,companySize,' \
            'Education,WorkYear,Salary,district,latitude,longitude,' \
            'companyType,positionLables,positionAdvantage,companyLabel\n'
    try:
        # 'with' guarantees the handle is closed even if a write fails
        # (the original leaked it on exception).
        with codecs.open('%s%s职位.csv' % (para['city'], para['kd']), 'w', 'gbk') as file:
            file.write(title)
            for p in info:
                # Every field goes through process() so embedded commas or
                # quotes cannot break the column layout.
                file.write(','.join(process(str(p[k])) for k in _CSV_FIELDS) + '\n')
        return True
    except Exception as e:
        logging.error('ProcessExcept %s' % str(e))
        print(e)
        return None


def main(url, para):
    """Crawl one city/keyword combination; return True on success, else None."""
    logging.error('Main start')   # breadcrumb in diary.log
    if url:
        info = getInfo(url, para)       # fetch + parse
        flag = processInfo(info, para)  # persist
        return flag
    else:
        return None


if __name__ == '__main__':
    kdList = [u'数据分析']    # search keywords
    cityList = [u'上海']      # target cities
    # Lagou's search AJAX endpoint (found via browser devtools network tab).
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    for keyword in kdList:
        for city in cityList:
            print('爬取%s' % city)
            para = {'first': 'true', 'pn': '1', 'kd': keyword, 'city': city}
            flag = main(url, para)
            if flag:
                print('%s爬取成功' % city)
            else:
                print('%s爬取失败' % city)
# https.py
"""HTTP helpers: GET/POST with retry, random User-Agent and optional proxy."""
import logging
import random
import urllib.parse

import requests

import setting

logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename='diary.log',
                    filemode='a')


class Http:
    '''
    HTTP request helpers.

    Note the historical interface asymmetry, preserved for callers:
    get() signals failure with the string 'None', post() with None.
    '''

    def get(self, url, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Fetch a page with GET.

        url: target link
        headers: request headers (random UA from setting.UA when omitted)
        cookies: cookies
        proxy: proxies mapping passed to requests
        timeOut: per-request timeout in seconds
        timeOutRetry: remaining retries after an exception
        return: page source, or the string 'None' on failure
        '''
        if not url:
            logging.error('GetError url not exit')
            return 'None'
        logging.error('Get %s' % url)
        try:
            if not headers:
                headers = {'User-Agent': random.choice(setting.UA)}
            response = requests.get(url, headers=headers, cookies=cookies,
                                    proxies=proxy, timeout=timeOut)
            if response.status_code in (200, 302):
                htmlCode = response.text
            else:
                htmlCode = 'None'
                logging.error('Get %s %s' % (str(response.status_code), url))
        except Exception as e:
            logging.error('GetExcept %s' % str(e))
            if timeOutRetry > 0:
                # Pass headers/cookies/proxy through on retry — the original
                # dropped them, so every retry went out without proxy/identity.
                htmlCode = self.get(url, headers=headers, cookies=cookies,
                                    proxy=proxy, timeOut=timeOut,
                                    timeOutRetry=timeOutRetry - 1)
            else:
                logging.error('GetTimeOut %s' % url)
                htmlCode = 'None'
        return htmlCode

    def post(self, url, para, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Fetch a response with POST.

        url: target link
        para: form data
        headers: request headers (fixed fallback UA when omitted)
        cookies: cookies
        proxy: proxies mapping passed to requests
        timeOut: per-request timeout in seconds
        timeOutRetry: remaining retries after an exception
        return: response text, or None on failure
        '''
        if not url or not para:
            logging.error('PostError url or para not exit')
            return None
        logging.error('Post %s' % url)
        try:
            if not headers:
                headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3'}
            response = requests.post(url, data=para, headers=headers,
                                     cookies=cookies, proxies=proxy, timeout=timeOut)
            if response.status_code in (200, 302):
                htmlCode = response.text
            else:
                htmlCode = None
                logging.error('Post %s %s' % (str(response.status_code), url))
        except Exception as e:
            logging.error('PostExcept %s' % str(e))
            if timeOutRetry > 0:
                # Same fix as get(): keep the full request context on retry.
                htmlCode = self.post(url, para=para, headers=headers,
                                     cookies=cookies, proxy=proxy,
                                     timeOut=timeOut,
                                     timeOutRetry=timeOutRetry - 1)
            else:
                logging.error('PostTimeOut %s' % url)
                htmlCode = None
        return htmlCode

    def confirm(self, htmlCode, url, headers, cookies, proxy, catch_retry=5):
        '''
        Anti-crawler verification-page hook.

        Intended to inspect the page title to detect a ban; currently a
        pass-through stub that returns htmlCode unchanged.
        '''
        return htmlCode

    def urlprocess(self, items):
        '''
        Percent-encode *items* for safe inclusion in a URL.

        urllib.parse.quote(..., safe='') escapes every reserved character
        (space %20, / %2F, ? %3F, # %23, & %26, = %3D, + %2B) and, unlike
        the original chained .replace() calls, escapes '%' itself (%25)
        in a single pass, so input that already contains '%' is no longer
        corrupted.
        '''
        return urllib.parse.quote(items, safe='')
# parse.py
"""Parse a Lagou positionAjax JSON response into plain dict records."""
import json
import re


class Parse:
    '''
    Wraps one JSON response body and exposes page-count and record parsing.
    '''

    def __init__(self, htmlCode):
        # Raw response text is kept for reference; the endpoint returns
        # standard JSON, so the stdlib parser replaces the unmaintained
        # third-party demjson dependency.
        self.htmlCode = htmlCode
        self.json = json.loads(htmlCode)

    def parseTool(self, content):
        '''
        Strip a fixed set of HTML tags/entities from *content*.
        Non-string input is returned unchanged.
        '''
        if not isinstance(content, str):
            return content
        sublist = ['<p.*?>', '</p.*?>', '<b.*?>', '</b.*?>',
                   '<div.*?>', '</div.*?>', '</br>', '<br />',
                   '<ul>', '</ul>', '<li>', '</li>',
                   '<strong>', '</strong>',
                   '<table.*?>', '<tr.*?>', '</tr>', '<td.*?>', '</td>',
                   '\r', '\n', '&.*?;', '&', '#.*?;', '<em>', '</em>']
        for pattern in sublist:
            content = re.sub(re.compile(pattern, re.S), '', content).strip()
        return content

    def parsePage(self):
        '''
        Return the number of result pages.

        Ceiling division of totalCount by resultSize; the original used
        total // size + 1, which requested one empty extra page whenever
        totalCount was an exact multiple of resultSize.  At least one
        page is always reported so the caller still fetches page 1 when
        there are no results.
        '''
        totalCount = int(self.json['content']['positionResult']['totalCount'])   # total positions
        resultSize = int(self.json['content']['positionResult']['resultSize'])   # positions per page
        return max(1, -(-totalCount // resultSize))

    def parseInfo(self):
        '''
        Map each raw position entry to a flat record dict with the key
        names expected by main.processInfo.
        '''
        info = []
        for position in self.json['content']['positionResult']['result']:
            i = {}
            i['companyName'] = position['companyFullName']
            i['positionType'] = position['firstType']
            i['positionName'] = position['positionName']
            i['companyStage'] = position['financeStage']
            i['companySize'] = position['companySize']
            i['positionEducation'] = position['education']
            i['positionWorkYear'] = position['workYear']
            i['positionSalary'] = position['salary']
            i['district'] = position['district']
            i['latitude'] = position['latitude']
            i['longitude'] = position['longitude']
            i['companyType'] = position['industryField']
            # 'positionLables' is Lagou's own (misspelled) field name.
            i['positionLables'] = position['positionLables']
            i['companyLabel'] = position['companyLabelList']
            i['positionAdvantage'] = position['positionAdvantage']
            info.append(i)
        return info
# setting.py
"""Static request configuration for the Lagou crawler: replayed browser
headers/cookies, proxy-pool settings and a User-Agent rotation list."""

# Headers replayed from a real browser session against the Lagou search
# page; Origin/Referer/X-Requested-With appear required for the AJAX
# endpoint to accept the POST — TODO confirm which are strictly needed.
headers = {
    'Host': 'www.lagou.com',
    'Connection': 'keep-alive',
    'Content-Length': '23',
    'Origin': 'https://www.lagou.com',
    'X-Anit-Forge-Code': '0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'X-Anit-Forge-Token': 'None',
    'Referer': 'https://www.lagou.com/jobs/list_java?city=%E5%B9%BF%E5%B7%9E&cl=false&fromSearch=true&labelWords=&suginput=',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
}

# Session cookies captured from a browser visit; presumably stale by now
# and need refreshing before a real crawl — NOTE(review): verify.
cookies = {
    'user_trace_token': '20171011085044-36afc724-ae1e-11e7-947d-5254005c3644',
    'LGUID': '20171011085044-36afc9e6-ae1e-11e7-947d-5254005c3644',
    '_ga': 'GA1.2.1411877279.1507683044',
    'index_location_city': '%E5%B9%BF%E5%B7%9E',
    'JSESSIONID': 'ABAAABAAADEAAFI2466B2149D4B3E406932CAEA37FDF471',
    '_gid': 'GA1.2.1604143331.1517585155',
    'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1515000882,1515252738,1516984463,1517585156',
    'LGSID': '20180202232556-5ce93c91-082d-11e8-abfa-5254005c3644',
    'PRE_UTM': '',
    'PRE_HOST': '',
    'PRE_SITE': '',
    'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
    'TG-TRACK-CODE': 'index_navigation',
    'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1517585322',
    'LGRID': '20180202232842-c0095589-082d-11e8-abfa-5254005c3644',
    'SEARCH_ID': '0a887843a48a49c7bb6dae915dabdcc1'
}

# IP proxy pool
# Pool mode flag: 0 = paid proxies, 1 = free proxies, 2 = no proxy.
TAGIP = 0
# Proxy address list; empty until a pool is populated.
IP = []
# User-Agent strings; https.Http.get picks one at random per request to
# vary the crawler's fingerprint.
UA = ['Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1))',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)',
      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; 360SE)',
      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; 360SE)',
      'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
      'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
      'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
      'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
      'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
      # NOTE(review): missing closing ')' in the original string; kept as-is.
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)',
      'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ',
      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
      'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
      'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0',
      'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
      'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)']