基本要求:
分析网页源码后:
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import io
import sys
import csv
import re
# Re-wrap stdout so that anything printed is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Output spreadsheet: written as gb18030 text; newline='' leaves line-ending
# handling to the csv module.
f = open('abc.csv', mode='w', encoding='gb18030', newline='')
csv_writer = csv.writer(f)

# Header row; get_page_text() writes its values in this same column order.
csv_writer.writerow([
    "标题",
    "人物",
    "岗位",
    "难度",
    "时长",
    "学习人数",
    "综合评分",
    "内容实用",
    "简洁易懂",
    "逻辑清晰",
    "姓名",
    "得分",
    "评论内容",
    "点赞数",
    "时间",
    "简介",
])
#open every page
def get_text(url, timeout=30):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises; prevents one stalled connection from hanging the run.

    Returns:
        The response body parsed by BeautifulSoup with the ``lxml`` parser.
    """
    res = requests.get(url, timeout=timeout)
    # ``encoding`` is the attribute that controls how ``res.text`` is decoded.
    # The earlier ``res.encode = ...`` only created an unused attribute, so the
    # body was decoded with whatever charset requests guessed.
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'lxml')
def get_page1(doc1):
    """Fetch the listing page behind each link of the skills navigation row.

    Looks up the ``div`` whose class is
    'course-nav-row course-nav-skills clearfix' in ``doc1``, takes every
    anchor in it except the first one, and downloads the corresponding
    listing URL sorted by popularity (``&sort=pop``).

    Args:
        doc1: Parsed document containing the navigation row.

    Returns:
        list: One ``get_text`` result per followed link, in page order.
    """
    nav_row = doc1.find('div', {'class': 'course-nav-row course-nav-skills clearfix'})
    # The first anchor of the row is deliberately left out.
    category_links = nav_row.findAll('a')[1:]
    return [
        # The first 12 characters of the href are dropped before it is appended
        # to the listing URL (presumably the '/course/list' prefix - confirm).
        get_text('https://www.imooc.com/course/list' + link.attrs['href'][12:] + '&sort=pop')
        for link in category_links
    ]
# find available information
def get_page(doc2):
    """Fetch the 'coursescore' page of up to three courses listed in ``doc2``.

    Every ``a.course-card`` anchor whose href starts with '/learn' is mapped
    to 'https://www.imooc.com/coursescore' plus the remainder of the href,
    and that URL is downloaded. Collection stops after three pages.

    Args:
        doc2: Parsed course listing page.

    Returns:
        list: At most three ``get_text`` results, in page order.
    """
    score_docs = []
    for card in doc2.findAll('a', {'class': 'course-card'}):
        link = card.attrs['href']
        if not link.startswith('/learn'):
            continue
        score_docs.append(get_text('https://www.imooc.com/coursescore' + link[6:]))
        if len(score_docs) == 3:
            break
    return score_docs
def get_introduceDoc(doc4):
    """Fetch the '/learn' page of up to three courses listed in ``doc4``.

    Every ``a.course-card`` anchor whose href starts with '/learn' is turned
    into an absolute 'https://www.imooc.com/learn...' URL and downloaded.
    Collection stops after three pages.

    Args:
        doc4: Parsed course listing page.

    Returns:
        list: At most three ``get_text`` results, in page order.
    """
    intro_docs = []
    for card in doc4.findAll('a', {'class': 'course-card'}):
        link = card.attrs['href']
        if not link.startswith('/learn'):
            continue
        intro_docs.append(get_text('https://www.imooc.com/learn' + link[6:]))
        if len(intro_docs) == 3:
            break
    return intro_docs
def get_AjaxSourceData(doc):
    """Fetch the AjaxCourseMembers response of up to three listed courses.

    Every ``a.course-card`` anchor whose href starts with '/learn' has its
    first seven characters removed (presumably the '/learn/' prefix, leaving
    the course id - confirm) and the rest is passed as the ``ids`` query
    parameter. Collection stops after three responses.

    Args:
        doc: Parsed course listing page.

    Returns:
        list: At most three ``get_text`` results, in page order.
    """
    member_docs = []
    for card in doc.findAll('a', {'class': 'course-card'}):
        link = card.attrs['href']
        if not link.startswith('/learn'):
            continue
        member_docs.append(
            get_text('https://www.imooc.com/course/AjaxCourseMembers?ids=' + link[7:])
        )
        if len(member_docs) == 3:
            break
    return member_docs
def _first_review_fields(page):
    """Return the five CSV fields describing the first review on ``page``.

    The fields are, in order: user name, score, review text, like count and
    time. When the page has no 'evaluation-list' block, or the block holds
    no reviews, five empty strings are returned so the CSV row keeps its
    column count instead of the scrape aborting.
    """
    review_list = page.find('div', {'class': 'evaluation-list'})
    if review_list is None:
        return [''] * 5
    reviews = review_list.findAll('div', {'class': 'evaluation evaluate'})
    if not reviews:
        return [''] * 5
    first = reviews[0]
    return [
        first.find('a', {'class': 'username'}).text,
        first.find('div', {'class': 'star-box'}).find('span').text,
        first.find('p', {'class': 'content'}).text,
        first.find('div', {'class': 'info clearfix'}).find('em').text,
        first.find('span', {'class': 'time'}).text,
    ]


def get_page_text(shortTitleDoc, doc5, AjaxData):
    """Extract one course's details and write them as a row to the CSV.

    Args:
        shortTitleDoc: Parsed page holding the course title, teacher info,
            the 'static-item' statistics, the rating list and the reviews.
        doc5: Parsed page holding the 'course-description course-wrap' block.
        AjaxData: Parsed response whose string form is searched for a
            ``numbers":"..."`` field; when found, its value replaces the
            third statistic (the learner-count column).

    Side effects:
        Appends one 16-column row through the module-level ``csv_writer``.
    """
    # Top section: title and teacher.
    h2 = shortTitleDoc.find('div', {'class': 'hd clearfix'}).find('h2').text
    teacher = shortTitleDoc.find('div', {'class': 'teacher-info l'})
    name1 = teacher.find('span', {'class': 'tit'}).find('a').text
    job1 = teacher.find('span', {'class': 'job'}).text

    # Top section: the first four statistics shown for the course.
    details = shortTitleDoc.findAll('div', {'class': 'static-item'})
    metaValue = [
        block.find('span', {'class': 'meta-value'}).text for block in details[:4]
    ]
    # Prefer the learner count from the Ajax response; keep the value scraped
    # from the page when the response does not contain the field.
    learners = re.findall(r'numbers":"(.*?)"', str(AjaxData))
    if learners:
        metaValue[2] = learners[0]

    # Middle section: the first three per-aspect ratings.
    rating_items = shortTitleDoc.find('div', {'class': 'evaluation-info'}).findAll('li')
    metaValue1 = [entry.find('span').text for entry in rating_items[:3]]

    # Review section: only the first review is recorded.
    (commentName, commentScore, commentContent,
     commentPraise, commentTime) = _first_review_fields(shortTitleDoc)

    content1 = doc5.find('div', {'class': 'course-description course-wrap'}).text

    # Explicit indices keep a missing statistic or rating a loud IndexError
    # rather than a silently shifted row.
    csv_writer.writerow([
        h2, name1, job1,
        metaValue[0], metaValue[1], metaValue[2], metaValue[3],
        metaValue1[0], metaValue1[1], metaValue1[2],
        commentName, commentScore, commentContent, commentPraise, commentTime,
        content1,
    ])
# Driver: walk every category listing, scrape up to three courses from each,
# and write one CSV row per course.
try:
    doc3 = get_text('https://www.imooc.com/course/list')
    longPage1 = get_page1(doc3)
    for category_doc in longPage1:
        # The three helpers apply the same '/learn' filter to the same page,
        # so their results line up course by course.
        shortTitleDoc1 = get_page(category_doc)
        shortTitleDoc3 = get_introduceDoc(category_doc)
        AjaxData = get_AjaxSourceData(category_doc)
        for score_doc, intro_doc, members_doc in zip(shortTitleDoc1, shortTitleDoc3, AjaxData):
            get_page_text(score_doc, intro_doc, members_doc)
finally:
    # Close (and thereby flush) the CSV even when a request or a parse step
    # fails part-way, so rows already written are not lost.
    f.close()
结果保存在CSV表格当中(部分数据)