pyhjy

爬取前程无忧信息,并保存到数据库中

import urllib.request ##请求
import ssl
import re
import xlwt
import pymysql


ssl._create_default_https_context = ssl._create_unverified_context

##去爬取数据,返回的是HTML页面的内容
def getContent(name,j):
    """Download one page of 51job search results and return it as text.

    Args:
        name: Search keyword that is placed into the URL path.
        j: Zero-based page index; the site is queried for page ``j + 1``.

    Returns:
        The response body decoded as GBK.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    # Function-scope import so this block does not depend on the file's
    # import section being changed.
    from urllib.parse import quote

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4503.5 Safari/537.36",
        "Connection": "keep-alive",
    }
    page_number = j + 1

    # Percent-encode the keyword: urllib refuses URLs containing non-ASCII
    # characters or spaces. "%" and "/" are kept as-is so keywords that the
    # caller already encoded are passed through unchanged.
    keyword = quote(name, safe="/%")
    url = "https://search.51job.com/list/000000,000000,0000,00,9,99,%s,2,%d.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=" % (keyword, page_number)

    # Request object = URL + request headers.
    req = urllib.request.Request(url, headers=headers)

    # Read the whole body; the context manager closes the connection even
    # if reading raises.
    with urllib.request.urlopen(req) as response:
        raw = response.read()

    # Decode explicitly as GBK so the Chinese text is not garbled.
    return raw.decode("GBK")

##使用正则找出 页面中 工作 相关的信息
# Compiled once at import time. The twelve capture groups are, in order:
# job_href, job_name, company_href, company_name, providesalary_text,
# workarea_text, updatedate, companytype_text, degreefrom, attribute_text,
# companysize_text, companyind_text.
_JOB_PATTERN = re.compile(r'"job_href":"(.+?)","job_name":"(.+?)".+?"company_href":"(.+?)","company_name":"(.+?)","providesalary_text":"(.*?)".+?"workarea_text":"(.*?)","updatedate":"(.*?)".*?"companytype_text":"(.*?)","degreefrom":"(.*?)".*?"attribute_text":(.*?),"companysize_text":"(.*?)",.*?,"companyind_text":"(.*?)".*?')


def getItem(content):
    """Extract the job records embedded in a search-result page.

    Args:
        content: Page text in which the job data appears as JSON-like
            ``"key":"value"`` pairs.

    Returns:
        A list of 12-tuples of strings, one per matched job, in the group
        order listed above ``_JOB_PATTERN``. The list is empty when nothing
        matches. ``attribute_text`` is returned as the raw, unparsed text
        that follows the key (it is not wrapped in quotes by the pattern).
    """
    return _JOB_PATTERN.findall(content)

##将找出的信息存储在Excel表格中
def saveExcel(list):
    """Write the scraped job records to the spreadsheet ``51job2.xls``.

    Args:
        list: Iterable of 12-item records in the regex group order
            (0 job URL, 1 job name, 2 company URL, 3 company name, 4 salary,
            5 work location, 6 publish date, 7 company type, 8 education,
            9 requirements, 10 company size, 11 industry).

    The sheet gets one header row followed by one row per record. The file
    is saved under a relative path, so it lands in the current directory.
    """
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet("数据分析50")

    titles = ["公司的名字", "公司的网址","公司类型","公司规模","行业","工作地点","岗位名字", "待遇","岗位详情", "发布时间","学历","招聘要求"]
    # For each spreadsheet column (left to right), the index of the record
    # field that fills it.
    field_for_column = (3, 2, 7, 10, 11, 5, 1, 4, 0, 6, 8, 9)

    # Header row.
    for column, title in enumerate(titles):
        worksheet.write(0, column, title)

    # Data rows start at row 1, directly below the header.
    for row, record in enumerate(list, start=1):
        for column, field in enumerate(field_for_column):
            worksheet.write(row, column, record[field])

    workbook.save("51job2.xls")

# Driver: ask for a keyword, then fetch and parse result pages one by one.
# ``list`` collects the extracted records from all pages.
# NOTE: the name shadows the builtin ``list`` for the rest of the module.
list = []
name = input("请输入您想要搜索的行业")
for j in range(201):
    # getContent adds 1 to the index, so the page shown to the user is j + 1.
    print("正在为您查询第%s页数据,请不要进行任何操作或退出程序。" % (j + 1))
    aaa = getContent(name, j)
    content = getItem(aaa)
    list += content

def saveMysql(list):
    """Insert the scraped job records into the ``sjfx`` table.

    Args:
        list: Iterable of 12-item records in the regex group order
            (0 job URL, 1 job name, 2 company URL, 3 company name, 4 salary,
            5 work location, 6 publish date, 7 company type, 8 education,
            9 requirements, 10 company size, 11 industry).

    All rows are inserted in one transaction. If an insert fails, nothing is
    committed and the connection is still closed.

    Raises:
        pymysql.MySQLError: If connecting or inserting fails.
    """
    # Placeholders let the driver escape the values. Scraped text routinely
    # contains quotes, which broke (and could inject into) the old
    # string-formatted statement.
    sql = (
        "insert into sjfx(name,wz,leix,gm,hy,gzdd,gwmz,dy,gwxq,fbsj,xl,zpyq) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    # The column names abbreviate the saveExcel header titles in the same
    # order (company name, company URL, company type, size, industry,
    # location, job name, salary, job URL, publish date, education,
    # requirements), so the same field-to-column mapping is applied here.
    # NOTE(review): inferred from the column names - confirm against the
    # actual sjfx table definition.
    field_for_column = (3, 2, 7, 10, 11, 5, 1, 4, 0, 6, 8, 9)
    rows = [tuple(record[k] for k in field_for_column) for record in list]

    conn = pymysql.connect(host="localhost",
                           user="root",
                           password="123",
                           database="xmmysql",
                           charset="utf8")
    try:
        cursor = conn.cursor()
        try:
            if rows:
                cursor.executemany(sql, rows)
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()

# saveExcel(list)
# saveMysql(list)

 

分类:

技术点:

相关文章:

  • 2022-12-23
  • 2021-07-08
  • 2021-11-18
  • 2022-02-15
  • 2022-12-23
  • 2021-07-20
  • 2021-12-05
  • 2021-06-18
猜你喜欢
  • 2021-10-12
  • 2021-04-16
  • 2021-06-19
  • 2021-11-04
  • 2022-01-17
  • 2021-12-07
  • 2022-01-19
相关资源
相似解决方案