本文主要使用Selenium调用谷歌浏览器,爬取前程无忧(https://mkt.51job.com)网站最近发布的招聘信息的前五页内容(本文以数据分析师为例子进行爬取),完整代码如下。
# Scrape the first five result pages of recent "数据分析师" (data analyst)
# job postings from 51job using Selenium (Chrome) + BeautifulSoup,
# collecting title / company / location / salary / publish date per posting.
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup  # was missing in the original; required by the parsing below
import pandas as pd            # was missing in the original; used for the final DataFrame

# Search-result URL for "数据分析师". Note `&degreefrom=99`: the original text
# had it mojibake'd into `°reefrom=99` (the `&deg` turned into the ° entity),
# which corrupts the query string.
url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
       '%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,1.html'
       '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
       '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)

data = []           # one dict per job posting, in crawl order
details_links = []  # detail-page URLs, kept for a possible follow-up crawl

try:
    driver.get(url)
    for page in range(5):
        print('正在爬取第' + str(page + 1) + '页信息')  # page + 1: human-readable 1-based page number
        # Wait until the "next page" button is clickable; this also guarantees
        # the result list has finished rendering before page_source is read.
        next_button = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#resultList > div.dw_page > div > div > div > ul > li:nth-child(8) > a')
            )
        )
        # Parse the rendered page. find_all returns every div.el row, and the
        # first one is the table header, so it is skipped with [1:].
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        rows = soup.find('div', class_='dw_table').find_all('div', class_='el')
        for row in rows[1:]:
            title_anchor = row.find('p', class_='t1').a
            details_links.append(title_anchor['href'])
            data.append({
                '职位名': title_anchor['title'],
                '公司': row.find('span', class_='t2').a['title'],
                '工作地址': row.find('span', class_='t3').text,
                '工资': row.find('span', class_='t4').text,
                '发布时间': row.find('span', class_='t5').text,
            })
        # Only advance when another page will actually be scraped; the
        # original clicked through to an unused sixth page.
        if page < 4:
            next_button.click()
finally:
    driver.quit()  # the original leaked the browser process

# Materialize the results; the original built the DataFrame but discarded it.
df = pd.DataFrame(data)
print(df)
运行出来的结果截图如下:(因为空间有限,只截取到了前面一点点)