rank

爬虫-广东人事考试网

广东人事考试网登录网址:https://ggfw.gdhrss.gov.cn/sydwbk/index.do

报名统计详情页面竟然不支持搜索和筛选,想看下报名人数还挺费劲,而且要查看岗位信息还得另外单独去查询。

中间省略N字的吐槽,开搞

 

1.第一步先登录,打开查看报名统计详情

 

2.F12打开开发人员工具,点开网络,然后开始记录网络日志。

如上图点击查询按钮,会发现只有一个请求,请求方法POST,请求URL: https://ggfw.gdhrss.gov.cn/sydwbk/exam/details/spQuery.do

表单数据有四个字段:bfa001、bab301、pages、rows。

到这里思路就很清晰了:pages、rows看起来是分页参数,而我们需要爬取全部数据,所以只提交bfa001、bab301这两项就好了。先试一下,看看返回结果:

 1 import os
 2 import sys
 3 import requests
 4 
 5 url = 'https://ggfw.gdhrss.gov.cn/sydwbk/exam/details/spQuery.do'
 6 data = {'bfa001':'2210841','bab301':'05'}
 7 headers = {
 8 'user-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
 9 'Cookie':'JSESSIONID=AKnYcHeRTlEK6iUGWsDPJZ-chm4mlGvT38vzrbuI-lhK4WIgJztk!29357817',
10 }
11 r = requests.post(url,data=data, headers = headers)
12 
13 with open('C:/Users/lenovo/Desktop/html.txt','w',encoding='utf-8') as f:
14     f.write(r.text)
15 os.system('C:/Users/lenovo/Desktop/html.txt')

返回json如下

{
    "rows": [{
            "aab004": "佛山市三水区公路养护中心",
            "aab019": 1,
            "aab119": 31,
            "aae036": 1648546541000,
            "bab301": "05",
            "bfe301": "2210841050001",
            "bfe3a4": "综合室工作岗位",
            "id": {
                "bfa001": "2210841",
                "bfz315": "87365"
            }
        },
        {
            "aab004": "佛山市三水区西南街道社区卫生服务中心",
            "aab019": 3,
            "aab119": 3,
            "aae036": 1648546541000,
            "bab301": "05",
            "bfe301": "2210841050002",
            "bfe3a4": "临床医师",
            "id": {
                "bfa001": "2210841",
                "bfz315": "86909"
            }
        },
... 中间省略 ...
        {
            "aab004": "佛山市三水区大塘镇宣传文体旅游办公室(佛山市三水区大塘镇教育办公室)",
            "aab019": 3,
            "aab119": 57,
            "aae036": 1648546541000,
            "bab301": "05",
            "bfe301": "2210841050128",
            "bfe3a4": "数学教师",
            "id": {
                "bfa001": "2210841",
                "bfz315": "87384"
            }
        }
    ],
    "total": 128
}

为了搞懂bfa001、bab301,那点开审查元素搜索一下,找到两项

从上图可看出bab301是城市代码

这里可以看出bfa001=2210841是固定值。再结合json结果,每一列的含义也都对上了:aab004=招聘单位,bfe3a4=招聘岗位,bfe301=岗位代码,aab019=聘用人数,aab119=报名人数,aae036=更新时间(毫秒时间戳),bab301=城市代码。

然后再完善一下,数据存到sqlite,完整代码如下:

#!/usr/bin/python
#coding:utf-8

import re
import requests
import json
import urllib
import os
import sys
import random
import time
from datetime import datetime
import sqlite3
from flask import Flask
from flask_sqlalchemy import SQLAlchemy

def Randomheader():
    """Build the HTTP request headers with a randomly chosen User-Agent.

    Returns:
        dict: headers containing a random ``user-agent`` plus the fixed
        ``Cookie``, ``Host``, ``Origin`` and ``Connection`` entries.
    """
    # One literal per line: a string literal must not span physical lines,
    # otherwise the module does not even parse.
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; pl-PL; rv:1.0.1) Gecko/20021111 Chimera/0.6',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418.8 (KHTML, like Gecko, Safari) Cheshire/1.0.UNOFFICIAL',
        'Mozilla/5.0 (X11; U; Linux i686; nl; rv:1.8.1b2) Gecko/20060821 BonEcho/2.0b2 (Debian-1.99+2.0b2+dfsg-1)',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
    ]
    UserAgent = random.choice(user_agent_list)
    return {
        'user-agent': UserAgent,
        # NOTE(review): hard-coded session cookie; presumably it has to be
        # replaced with the JSESSIONID of a current logged-in session.
        'Cookie': 'JSESSIONID=AKnYcHeRTlEK6iUGWsDPJZ-chm4mlGvT38vzrbuI-lhK4WIgJztk!29357817',
        'Host': 'ggfw.gdhrss.gov.cn',
        'Origin': 'https://ggfw.gdhrss.gov.cn',
        'Connection': 'keep-alive',
    }

# Flask is used only as the host object that Flask-SQLAlchemy binds to;
# no routes are registered in this script.
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///db.sqlite3'
# Modification tracking emits signals that nothing here listens to; the
# Flask-SQLAlchemy docs recommend disabling it to avoid the extra overhead.
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
# NOTE(review): deprecated in Flask-SQLAlchemy 2.4.3 and no longer honoured in
# 3.x; kept for older installs. The script commits explicitly anyway.
app.config['SQLALCHEMY_COMMIT_ON_TEARDOWN'] = True
db = SQLAlchemy(app)

class Exam(db.Model):
    """Registration statistics for one recruitment position.

    Rows are filled from the JSON returned by the spQuery.do endpoint.
    The primary key is the composite (id, city, gwdm).
    """
    # Taken from row['id']['bfz315'] of the response.
    id = db.Column(db.String(255), primary_key=True)
    # City code, taken from row['bab301'].
    city = db.Column(db.String(255), primary_key=True)
    # Position code, taken from row['bfe301'].
    gwdm = db.Column(db.String(255), primary_key=True)
    # Recruiting unit name, taken from row['aab004'].
    zpdw = db.Column(db.String(255))
    # Position name, taken from row['bfe3a4'].
    zpgw = db.Column(db.String(255))
    # Number of openings, taken from row['aab019'].
    pyrs = db.Column(db.Integer)
    # Number of applicants, taken from row['aab119'].
    bmrs = db.Column(db.Integer)
    # Statistics timestamp, converted from the millisecond value row['aae036'].
    tjsj = db.Column(db.DateTime)

    def __repr__(self):
        # Applicant count first, then position code, unit and position name.
        return f'{self.bmrs}\t{self.gwdm} {self.zpdw} {self.zpgw}'

def get_exam_by_city(city_code):
    """Fetch the registration statistics of one city and upsert them into Exam.

    Existing rows (matched on id, city and position code) only get their
    applicant count and timestamp refreshed; unknown rows are inserted.

    Args:
        city_code: City code sent as the ``bab301`` form field, e.g. ``'05'``.

    Returns:
        None. A summary line is printed on success; any error (network,
        HTTP status, malformed JSON, database) is printed and swallowed so
        that the caller can continue with the next city.
    """
    url = 'https://ggfw.gdhrss.gov.cn/sydwbk/exam/details/spQuery.do'
    data = {'bfa001':'2210841','bab301':city_code}

    try:
        # The request lives inside the try block so that a network failure is
        # reported like every other error instead of aborting the whole run.
        # The timeout prevents an unresponsive server from hanging the script.
        r = requests.post(url, data=data, headers=Randomheader(), timeout=30)
        r.raise_for_status()
        rows = json.loads(r.text)

        exam_list = []
        for row in rows['rows']:
            exam_id = row['id']['bfz315']
            # aae036 is a millisecond epoch value; fromtimestamp wants seconds.
            updated_at = datetime.fromtimestamp(row['aae036']/1000)
            exam = Exam.query.filter_by(id=exam_id, city=row['bab301'], gwdm=row['bfe301']).first()
            if exam:
                exam.bmrs = row['aab119']
                exam.tjsj = updated_at
            else:
                exam = Exam(
                    id=exam_id,
                    city=row['bab301'],
                    gwdm=row['bfe301'],
                    zpdw=row['aab004'],
                    zpgw=row['bfe3a4'],
                    pyrs=row['aab019'],
                    bmrs=row['aab119'],
                    tjsj=updated_at,
                )
            # Skip records that lack any part of the composite primary key.
            if exam.id and exam.city and exam.gwdm:
                exam_list.append(exam)
        db.session.add_all(exam_list)
        db.session.commit()
        print(f"【{city_code}】爬取数目={rows['total']} 数据库数目={Exam.query.filter_by(city=city_code).count()}")
    except Exception as ex:
        # Discard the half-finished transaction so the session stays usable.
        db.session.rollback()
        print(repr(ex))
    finally:
        db.session.close()

if __name__ == '__main__':
    # Flask-SQLAlchemy 3.x needs an active application context for
    # create_all(), Model.query and db.session; pushing one is harmless on 2.x.
    with app.app_context():
        db.create_all()
        get_exam_by_city('99')  # provincial-level units (省直)
        # City codes are the zero-padded numbers 01..21.
        for city_number in range(1, 22):
            time.sleep(10)  # throttle so the site is not hit too frequently
            get_exam_by_city(f'{city_number:02d}')

 

3.至此,招聘信息和岗位报名人数拿到了,但是没有岗位详情。

招聘公告 附件有公开招聘岗位表

把Excel表下载下来导入到数据库,结合上面的Exam表来查询。

导入完整代码如下:

#!/usr/bin/python
#coding:utf-8

import re
import requests
import json
import urllib
import os
import sys
import random
import time
from datetime import datetime
import sqlite3
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
import openpyxl


# Flask is used only as the host object that Flask-SQLAlchemy binds to;
# no routes are registered in this script.
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///db.sqlite3'
# Modification tracking emits signals that nothing here listens to; the
# Flask-SQLAlchemy docs recommend disabling it to avoid the extra overhead.
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
# NOTE(review): deprecated in Flask-SQLAlchemy 2.4.3 and no longer honoured in
# 3.x; kept for older installs. The script commits explicitly anyway.
app.config['SQLALCHEMY_COMMIT_ON_TEARDOWN'] = True
db = SQLAlchemy(app)

class Job(db.Model):
    """Position details imported from the published recruitment spreadsheet.

    The primary key is the composite (dwbh, gwdm). The import script fills
    each attribute from one spreadsheet column, noted per field.
    """
    # Exam district name (spreadsheet column A).
    city_name = db.Column(db.String(255))
    # Spreadsheet column B. NOTE(review): presumably the unit number - confirm.
    dwbh = db.Column(db.String(255), primary_key=True)
    # Position code (column C).
    gwdm = db.Column(db.String(255), primary_key=True)
    # Recruiting unit name (column D).
    zpdw = db.Column(db.String(255))
    # Position name (column E).
    zpgw = db.Column(db.String(255))
    # Position description (column F).
    gwjj = db.Column(db.String(255))
    # Position grade (column G).
    gwdj = db.Column(db.String(255))
    # Education requirement (column K).
    xlyq = db.Column(db.String(255))
    # Degree requirement (column L).
    xwyq = db.Column(db.String(255))
    # Major requirement (column N).
    zyyq = db.Column(db.String(255))
    # Work-experience requirement (column U).
    gzjl = db.Column(db.String(255))

    def __repr__(self):
        # Column-B value first, then position code, unit and position name.
        return f'{self.dwbh}\t{self.gwdm} {self.zpdw} {self.zpgw}'

if __name__ == '__main__':
    # Spreadsheet column letter that feeds each Job attribute.
    column_of = {
        'city_name': 'A',
        'dwbh': 'B',
        'gwdm': 'C',
        'zpdw': 'D',
        'zpgw': 'E',
        'gwjj': 'F',
        'gwdj': 'G',
        'xlyq': 'K',
        'xwyq': 'L',
        'zyyq': 'N',
        'gzjl': 'U',
    }

    # Flask-SQLAlchemy 3.x needs an active application context for
    # create_all() and db.session; pushing one is harmless on 2.x.
    with app.app_context():
        db.create_all()
        wb = openpyxl.load_workbook('C:/Users/lenovo/Desktop/附件1:公开招聘岗位表.xlsx')
        sheet = wb.worksheets[0]

        job_list = []
        for row_number in range(1, sheet.max_row + 1):
            values = {
                field: sheet[f'{column}{row_number}'].value
                for field, column in column_of.items()
            }
            if not values['dwbh'] or not values['gwdm']:
                continue
            # Excel may store the codes as numbers; str.isdigit() on an int
            # raises TypeError, so normalise both key columns to text first.
            values['dwbh'] = str(values['dwbh']).strip()
            values['gwdm'] = str(values['gwdm']).strip()
            # Header and caption rows have no all-digit position code.
            if not values['gwdm'].isdigit():
                continue
            job_list.append(Job(**values))

        db.session.add_all(job_list)
        db.session.commit()

注意:导入前要先把Excel表的第一行(就是合并单元格的那一行)删除并保存。

有需要的话,还可以先筛选Excel再导入。

 

数据库查询代码如下:

我这里只查看('省直','广州','佛山')三个考区,按岗位人均报名人数升序排序。

-- Combine the scraped registration counts (exam) with the imported position
-- details (job) via the position code, keep three exam districts, and sort
-- ascending by applicants per opening.
select b.city_name 考区,a.zpdw 招聘单位,a.zpgw 招聘岗位,a.gwdm 岗位代码,b.gwdj 岗位等级,b.gwjj 岗位简介,b.xlyq 学历要求,b.xwyq 学位要求,b.zyyq 专业要求,b.gzjl 工作经历,a.pyrs 聘用人数 ,a.bmrs 报名人数,a.tjsj 更新时间
from exam a
join job b on a.gwdm=b.gwdm
where b.city_name in ('省直','广州','佛山')
-- bmrs and pyrs are INTEGER columns; without the 1.0 factor SQLite performs
-- integer division, so e.g. 5/3 and 4/3 would both sort as 1.
order by a.bmrs*1.0/a.pyrs;

 

相关文章: