Cookie-Jing

目标:爬取 51job 搜索结果页上的“数据分析”职位信息,实现自动翻页,并将结果存储为 CSV 文件。

import csv
import os
import time
from concurrent.futures.thread import ThreadPoolExecutor
from json import loads
from multiprocessing import Queue
from re import findall
from threading import Thread

import pandas as pd
import pymysql
import requests
# Fetch the job entries of a single search-result page.
def get_one_page(page, city_code='000000', keyword='数据分析', timeout=10):
    """Fetch one page of 51job search results and return its job entries.

    Args:
        page: Page number substituted into the search URL.
        city_code: Code placed in the first segment of the URL path. The
            default '000000' reproduces the previously hard-coded URL.
            NOTE(review): presumably 51job's job-area code -- confirm.
        keyword: Search keyword embedded in the URL path.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The value stored under 'engine_search_result' in the JSON object
        the page assigns to ``window.__SEARCH_RESULT__``, or None when the
        request fails, the status is not 200, or the payload is missing.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 90.0.4430.212 Safari / 537.36'
    }
    url = (
        f'https://search.51job.com/list/{city_code},000000,0000,00,9,99,'
        f'{keyword},2,{page}.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a
        # non-200 status so the caller's "no result" check handles both.
        print(f'请求失败! {exc}')
        return None

    if response.status_code != 200:
        print('请求失败!')
        return None

    # The results are embedded in the HTML as a JSON literal assigned to
    # window.__SEARCH_RESULT__ inside a <script> tag.
    matches = findall(
        r'window\.__SEARCH_RESULT__\s*=\s*(\{.+?\})</script>',
        response.text,
    )
    if not matches:
        # The page loaded but carries no payload; signal "no data"
        # instead of raising IndexError on matches[0].
        print('未找到搜索结果数据!')
        return None
    return loads(matches[0]).get('engine_search_result')
# Fetch up to 10 consecutive result pages, stopping early as soon as a
# page comes back empty or the request fails.
start_page = 1
ts = []  # one list of job dicts per fetched page
for _ in range(10):
    result = get_one_page(start_page)
    if not result:
        print('没有更多数据')
        break
    ts.append(result)
    start_page += 1

# Flatten all pages into a single list of job dicts. Iterating over what
# each page actually contains (instead of assuming exactly 50 entries per
# page) avoids an IndexError when a page is shorter.
data_1 = [job for page_jobs in ts for job in page_jobs]

# Keep only the fields of interest, one row per job.
jobs = [
    [
        job.get('job_name'),
        job.get('providesalary_text'),
        job.get('company_name'),
        job.get('companytype_text'),
        job.get('workarea_text'),
        # attribute_text is joined into a single '-'-separated string; a
        # placeholder list is used when the key is absent.
        '-'.join(job.get('attribute_text', ['-', '-', '-', '-', '-'])),
        job.get('jobwelf'),
    ]
    for job in data_1
]

# Column headers for the CSV. NOTE(review): 'workarea_tex' looks like a typo
# for 'workarea_text'; it is kept unchanged so the header of the generated
# file stays the same -- confirm before renaming.
name = [
    'job_name',
    'providesalary_text',
    'company_name',
    'companytype_text',
    'workarea_tex',
    'attribute_text',
    'jobwelf',
]
test = pd.DataFrame(columns=name, data=jobs)
test.to_csv("testcsv.csv")  # save as CSV
test.info()
<class \'pandas.core.frame.DataFrame\'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_name            500 non-null    object
 1   providesalary_text  500 non-null    object
 2   company_name        500 non-null    object
 3   companytype_text    500 non-null    object
 4   workarea_tex        500 non-null    object
 5   attribute_text      500 non-null    object
 6   jobwelf             500 non-null    object
dtypes: object(7)
memory usage: 27.5+ KB


重要参考:https://gitee.com/wenhaha8/job51_analysis

分类:

技术点:

相关文章: