Python网络爬虫———职位数据抓取及分析
一、选题的背景
为什么要选择此选题?要达到的数据分析的预期目标是什么?
在毕业之际,许多学生并不了解计算机岗位的薪资待遇。为了让学生了解计算机行业的最高薪资以及平均薪资,帮助每个学生更好地认清行业前景,故选择此选题。
二、主题式网络爬虫设计方案
1、名称:人才网招聘爬虫
2、爬取内容的数据与分析特征
此次爬取的内容数据有:工作岗位、公司、薪资、发布日期、招聘信息地址
3、爬虫设计方案
1.根据网页的样式进行目标选取
2.进行目标内容取出
3.内容保存
4.技术难点:主要有文件保存,以及内容取出,反爬机制
三、主题页面的结构特征分析
使用beautifulsoup解析页面,获取JS中所需数据
soup.find_all('script')[7]
四、网络爬虫程序设计
1.数据爬取与采集
import time
import requests
from bs4 import BeautifulSoup
import os
import csv
import re


def analysis(item, results):
    """Return every match of regex `item` in `results` (case-insensitive, multiline)."""
    pattern = re.compile(item, re.I | re.M)
    return pattern.findall(results)


def precess(item):
    """Sanitize a field for CSV output: commas become spaces, backslashes are removed."""
    return item.replace(',', ' ').replace('\\', '')


# Request headers that mimic a desktop browser (basic anti-scraping measure).
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'
}

# Search-result URL template; {} is the page number.
# Fixed: '°reefrom' was mojibake for '&degreefrom' ('&deg' had been turned
# into the degree sign by an HTML-entity pass, breaking the query string).
url_pattern = ("https://search.51job.com/list/000000,000000,0000,00,9,99,"
               "%25E8%25AE%25A1%25E7%25AE%2597%25E6%259C%25BA,2,{}.html"
               "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
               "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=")

if not os.path.exists("intro_job.csv"):
    # First run only: create the CSV with its header row.
    # Fixed: use a context manager so the handle is closed even on error.
    with open('intro_job.csv', 'w', encoding='utf-8-sig', newline='') as file:
        csv_head = csv.writer(file)
        header = ['job', 'company', 'place', 'salary', 'date', 'detail_url']
        csv_head.writerow(header)
2.对数据进行清洗和处理
1.数据清洗
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import math
import re

# Load the scraped data; only the columns needed for the analysis.
df = pd.read_csv('intro_job.csv', encoding='utf-8-sig',
                 usecols=["job", "company", "place", "salary", "date"])

# Pull each column out as a NumPy array.
job_array = df['job'].values
company_array = df['company'].values
place_array = df['place'].values
salary_array = df['salary'].values
date_array = df['date'].values

# Rows with a missing salary come back from pandas as float NaN; build a
# boolean mask and apply it to every column so the rows stay aligned.
# Fixed: np.bool was deprecated in NumPy 1.20 and removed in 1.24 —
# the builtin bool is the correct dtype here.
bool_array = np.ones_like(salary_array, dtype=bool)
for i in range(len(salary_array)):
    if isinstance(salary_array[i], float):
        bool_array[i] = False

print(len(job_array))
print(sum(bool_array))
job_array = job_array[bool_array]
print(len(job_array))
company_array = company_array[bool_array]
place_array = place_array[bool_array]
salary_array = salary_array[bool_array]
date_array = date_array[bool_array]
2.数据处理
# Reduce each work place to its city, e.g. "上海-浦东" -> "上海".
# str.partition returns the whole string as the first part when '-' is absent.
place_array_city = [place.partition('-')[0] for place in place_array]


def calc_money(salary_tmp):
    """Parse the numeric part of a salary string; a 'lo-hi' range yields the midpoint."""
    # Strip the first unit marker found, in the same priority order: 千, 万, 元.
    for unit in ('千', '万', '元'):
        if unit in salary_tmp:
            salary_tmp = salary_tmp[:salary_tmp.index(unit)]
            break
    if '-' in salary_tmp:
        parts = salary_tmp.split('-')
        return (float(parts[0]) + float(parts[1])) / 2
    return float(salary_tmp)


def calc_total(salary_tmp):
    """Scale the parsed number to yuan according to the trailing unit ('千'/'万')."""
    money = calc_money(salary_tmp)
    if salary_tmp[-1] == '千':
        money *= 1000
    elif salary_tmp[-1] == '万':
        money *= 10000
    return money


def calc_mean(salary):
    """Normalise one salary string to an average monthly wage in yuan."""
    # Drop the period suffix: "/小时" is three characters, "/月" "/年" "/天" are two.
    salary_tmp = salary[:-3] if '小时' in salary else salary[:-2]
    money = calc_total(salary_tmp)
    if '年' in salary:
        money /= 12.0           # yearly -> monthly
    elif '天' in salary:
        money *= 30             # daily -> monthly (30 days)
    elif '小时' in salary:
        money = money * 8 * 20  # hourly -> monthly (8-hour days, 20 days)
    return money


# Average monthly salary of every posting.
salary_array_mean = [calc_mean(salary) for salary in salary_array]

# city -> number of postings, and city -> summed monthly salary; dividing the
# latter by the former gives the city's average monthly salary.
city_dict = {}
salary_dict = {}
for i in range(len(place_array_city)):
    if city_dict.get(place_array_city[i]):
        city_dict[place_array_city[i]] += 1
        salary_dict[place_array_city[i]] += salary_array_mean[i]
    else:
        city_dict[place_array_city[i]] = 1
        salary_dict[place_array_city[i]] = salary_array_mean[i]
4.数据可视化与分析
# Fixed: configure the Chinese-capable font BEFORE any text artist is
# created — rcParams are captured when a Text object is made, so setting
# them after plt.plot()/plt.legend() left the Chinese labels as boxes.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Nationwide average monthly salary across all postings.
mean_salary = sum(salary_array_mean) / len(salary_array_mean)

# Cities sorted by posting count, descending.
d_order = sorted(city_dict.items(), key=lambda x: x[1], reverse=True)

# Average monthly salary for the 20 cities with the most postings.
mean_top_20 = []
city_top_20 = []
for city, count in d_order[:20]:
    mean_top_20.append(salary_dict[city] / count)
    city_top_20.append(city)

ax = plt.axes()
labels = ax.get_xticklabels()
plt.plot(city_top_20, mean_top_20, marker='o', label='各市平均月薪')
plt.plot([mean_salary] * 20, '--', label='全国平均月薪')
plt.setp(labels, rotation=30.)
plt.legend()
plt.show()
# Scatter of raw salary strings against raw locations — both axes are the
# unparsed string columns, so matplotlib treats them as categorical.
plt.scatter(place_array, salary_array, s=10, marker="o")
plt.xlabel("地点")
plt.ylabel("薪资")
plt.title("薪资分布-散点图")
plt.show()  # Fixed: the figure was created but never displayed
5.将以上各部分的代码汇总,附上完整程序代码
import time
import requests
from bs4 import BeautifulSoup
import os
import csv
import re

import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import math


def analysis(item, results):
    """Return every match of regex `item` in `results` (case-insensitive, multiline)."""
    pattern = re.compile(item, re.I | re.M)
    return pattern.findall(results)


def precess(item):
    """Sanitize a field for CSV output: commas become spaces, backslashes are removed."""
    return item.replace(',', ' ').replace('\\', '')


# Request headers that mimic a desktop browser (basic anti-scraping measure).
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'
}

# Search-result URL template; {} is the page number.
# Fixed: '°reefrom' was mojibake for '&degreefrom' ('&deg' had been turned
# into the degree sign by an HTML-entity pass, breaking the query string).
url_pattern = ("https://search.51job.com/list/000000,000000,0000,00,9,99,"
               "%25E8%25AE%25A1%25E7%25AE%2597%25E6%259C%25BA,2,{}.html"
               "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
               "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=")

if not os.path.exists("intro_job.csv"):
    # First run only: create the CSV with its header row.
    with open('intro_job.csv', 'w', encoding='utf-8-sig', newline='') as file:
        csv.writer(file).writerow(
            ['job', 'company', 'place', 'salary', 'date', 'detail_url'])

# ---------------- scraping ----------------
for page in range(1, 2001):
    # Throttle requests to avoid triggering the site's anti-scraping defences.
    time.sleep(5)
    url = url_pattern.format(page)
    response = requests.get(url=url, headers=headers)
    # The site serves GBK-encoded pages; declare it before reading .text.
    response.encoding = 'gbk'
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # The job data lives in a JS object inside the 8th <script> tag.
    results = str(soup.find_all('script')[7])
    job_names = analysis(r'"job_name":"(.*?)"', results)
    company_names = analysis(r'"company_name":"(.*?)"', results)
    workarea_texts = analysis(r'"workarea_text":"(.*?)"', results)
    providesalary_texts = analysis(r'"providesalary_text":"(.*?)"', results)
    updatedates = analysis(r'"updatedate":"(.*?)"', results)
    job_hrefs = analysis(r'"job_href":"(.*?)"', results)
    # Fixed: open the file once per page (was once per row), and stop the
    # inner loop from shadowing the page counter `i`.
    with open('intro_job.csv', 'a+', encoding='utf-8-sig') as f:
        for row in zip(job_names, company_names, workarea_texts,
                       providesalary_texts, updatedates, job_hrefs):
            f.write(','.join(precess(field) for field in row) + '\n')

# ---------------- data cleaning ----------------
df = pd.read_csv('intro_job.csv', encoding='utf-8-sig',
                 usecols=["job", "company", "place", "salary", "date"])
job_array = df['job'].values
company_array = df['company'].values
place_array = df['place'].values
salary_array = df['salary'].values
date_array = df['date'].values

# Rows with a missing salary come back from pandas as float NaN; mask them
# out of every column so the rows stay aligned.
# Fixed: np.bool was removed in NumPy 1.24 — use the builtin bool dtype.
bool_array = np.ones_like(salary_array, dtype=bool)
for i in range(len(salary_array)):
    if isinstance(salary_array[i], float):
        bool_array[i] = False
print(len(job_array))
print(sum(bool_array))
job_array = job_array[bool_array]
print(len(job_array))
company_array = company_array[bool_array]
place_array = place_array[bool_array]
salary_array = salary_array[bool_array]
date_array = date_array[bool_array]

# ---------------- data processing ----------------
# Reduce each work place to its city, e.g. "上海-浦东" -> "上海".
place_array_city = [place.partition('-')[0] for place in place_array]


def calc_money(salary_tmp):
    """Parse the numeric part of a salary string; a 'lo-hi' range yields the midpoint."""
    for unit in ('千', '万', '元'):
        if unit in salary_tmp:
            salary_tmp = salary_tmp[:salary_tmp.index(unit)]
            break
    if '-' in salary_tmp:
        parts = salary_tmp.split('-')
        return (float(parts[0]) + float(parts[1])) / 2
    return float(salary_tmp)


def calc_total(salary_tmp):
    """Scale the parsed number to yuan according to the trailing unit ('千'/'万')."""
    money = calc_money(salary_tmp)
    if salary_tmp[-1] == '千':
        money *= 1000
    elif salary_tmp[-1] == '万':
        money *= 10000
    return money


def calc_mean(salary):
    """Normalise one salary string to an average monthly wage in yuan."""
    # Drop the period suffix: "/小时" is three characters, "/月" "/年" "/天" are two.
    salary_tmp = salary[:-3] if '小时' in salary else salary[:-2]
    money = calc_total(salary_tmp)
    if '年' in salary:
        money /= 12.0           # yearly -> monthly
    elif '天' in salary:
        money *= 30             # daily -> monthly (30 days)
    elif '小时' in salary:
        money = money * 8 * 20  # hourly -> monthly (8-hour days, 20 days)
    return money


# Average monthly salary of every posting.
salary_array_mean = [calc_mean(salary) for salary in salary_array]

# city -> posting count, and city -> summed monthly salary; dividing the
# latter by the former gives the city's average monthly salary.
city_dict = {}
salary_dict = {}
for i in range(len(place_array_city)):
    if city_dict.get(place_array_city[i]):
        city_dict[place_array_city[i]] += 1
        salary_dict[place_array_city[i]] += salary_array_mean[i]
    else:
        city_dict[place_array_city[i]] = 1
        salary_dict[place_array_city[i]] = salary_array_mean[i]

# ---------------- visualisation ----------------
# Fixed: set the Chinese-capable font BEFORE creating any text artists,
# otherwise the Chinese labels render as empty boxes.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

mean_salary = sum(salary_array_mean) / len(salary_array_mean)
d_order = sorted(city_dict.items(), key=lambda x: x[1], reverse=True)
mean_top_20 = []
city_top_20 = []
for city, count in d_order[:20]:
    mean_top_20.append(salary_dict[city] / count)
    city_top_20.append(city)

ax = plt.axes()
labels = ax.get_xticklabels()
plt.plot(city_top_20, mean_top_20, marker='o', label='各市平均月薪')
plt.plot([mean_salary] * 20, '--', label='全国平均月薪')
plt.setp(labels, rotation=30.)
plt.legend()
plt.show()

plt.scatter(place_array, salary_array, s=10, marker="o")
plt.xlabel("地点")
plt.ylabel("薪资")
plt.title("薪资分布-散点图")
plt.show()  # Fixed: the scatter figure was never displayed
五、总结
本次爬虫以及数据分析达到初步预期,但得到的对比效果不够明显,分析不够完整。从散点图上可以看出,计算机岗位薪资的提升上限是很高的。
完成本次设计收获很多,也发现了很多不足的地方:对数据的分析和整合还很不熟练;同时也提高了对 Python 的理解。