Crush999

基本要求:

 

 

 分析网页源码后:

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import io
import sys
import csv
import re
# Re-wrap stdout as UTF-8 (presumably so printing scraped non-ASCII text does
# not fail on consoles whose default encoding differs).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Output file shared by the whole script; it is closed at the end of the crawl.
# newline='' lets the csv module control line endings itself.
f = open('abc.csv', 'w', encoding='gb18030', newline='')
csv_writer = csv.writer(f)
# Header row; get_page_text() writes its values in this same column order.
csv_writer.writerow(["标题","人物" , "岗位","难度","时长","学习人数" ,"综合评分" ,"内容实用" ,"简洁易懂" ,"逻辑清晰" ,"姓名","得分","评论内容","点赞数","时间" ,"简介"])

# Download a page and parse it.
def get_text(url, timeout=30):
    """Fetch ``url`` and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests raises a
            timeout error; without it a stalled connection blocks forever.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    res = requests.get(url, timeout=timeout)
    # `encoding` is the attribute requests consults when decoding `.text`.
    # The original assigned to `encode`, which requests never reads, so the
    # UTF-8 override silently had no effect.
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'lxml')


def get_page1(doc1):
    """Fetch every listing page linked from the skills navigation row.

    Args:
        doc1: Parsed course index page.

    Returns:
        A list of parsed listing pages, one per navigation link after the
        first, each requested with ``&sort=pop`` appended.
    """
    nav_row = doc1.find('div', {'class': 'course-nav-row course-nav-skills clearfix'})
    # The first link in the row is skipped, as in the original
    # (presumably the "all" entry -- TODO confirm against the live page).
    listing_pages = []
    for link in nav_row.find_all('a')[1:]:
        href = link.attrs['href']
        # The first 12 characters of the href are dropped and replaced by the
        # absolute listing URL (assumes hrefs start with "/course/list").
        listing_url = 'https://www.imooc.com/course/list' + href[12:] + '&sort=pop'
        listing_pages.append(get_text(listing_url))
    return listing_pages
		

# Collect the review pages of the first courses on a listing page.
def get_page(doc2):
    """Fetch the score/review pages for the first three courses in ``doc2``.

    Args:
        doc2: Parsed course listing page.

    Returns:
        A list of at most three parsed pages, in listing order. Only
        ``course-card`` links whose href starts with ``/learn`` are followed;
        that prefix is swapped for ``/coursescore``.
    """
    review_docs = []
    for card in doc2.find_all('a', {'class': 'course-card'}):
        href = card.attrs['href']
        if not href.startswith('/learn'):
            continue
        review_docs.append(get_text('https://www.imooc.com/coursescore' + href[6:]))
        # Only the first three matching courses per listing page are scraped.
        if len(review_docs) == 3:
            break
    return review_docs

def get_introduceDoc(doc4):
    """Fetch the introduction pages for the first three courses in ``doc4``.

    Args:
        doc4: Parsed course listing page.

    Returns:
        A list of at most three parsed ``/learn...`` course pages, in listing
        order. Only ``course-card`` links whose href starts with ``/learn``
        are followed.
    """
    intro_docs = []
    for card in doc4.find_all('a', {'class': 'course-card'}):
        href = card.attrs['href']
        if not href.startswith('/learn'):
            continue
        intro_docs.append(get_text('https://www.imooc.com/learn' + href[6:]))
        # Only the first three matching courses per listing page are scraped.
        if len(intro_docs) == 3:
            break
    return intro_docs

def get_AjaxSourceData(doc):
    """Fetch the AjaxCourseMembers response for the first three courses in ``doc``.

    Args:
        doc: Parsed course listing page.

    Returns:
        A list of at most three parsed responses, in listing order. Only
        ``course-card`` links whose href starts with ``/learn`` are used.
    """
    member_docs = []
    for card in doc.find_all('a', {'class': 'course-card'}):
        href = card.attrs['href']
        if not href.startswith('/learn'):
            continue
        # href[7:] drops the first seven characters to form the ``ids`` value
        # (assumes hrefs look like "/learn/<id>" -- TODO confirm).
        member_docs.append(
            get_text('https://www.imooc.com/course/AjaxCourseMembers?ids=' + href[7:])
        )
        # Only the first three matching courses per listing page are scraped.
        if len(member_docs) == 3:
            break
    return member_docs
	

def get_page_text(shortTitleDoc, doc5, AjaxData):
    """Extract one course's details and write them as a row to the CSV.

    Args:
        shortTitleDoc: Parsed score/review page of the course (from ``get_page``).
        doc5: Parsed introduction page of the same course (from
            ``get_introduceDoc``); only its description block is read.
        AjaxData: Parsed AjaxCourseMembers response (from
            ``get_AjaxSourceData``); its string form is searched for a
            ``numbers":"..."`` field.

    Side effects:
        Writes one 16-column row through the module-level ``csv_writer``, in
        the same order as the header row.
    """
    # Top section: course title, teacher name and job title.
    course_title = shortTitleDoc.find('div', {'class': 'hd clearfix'}).find('h2').text
    teacher = shortTitleDoc.find('div', {'class': 'teacher-info l'})
    teacher_name = teacher.find('span', {'class': 'tit'}).find('a').text
    teacher_job = teacher.find('span', {'class': 'job'}).text

    # The first four static items fill header columns 4-7.
    static_items = shortTitleDoc.find_all('div', {'class': 'static-item'})
    meta_values = [
        item.find('span', {'class': 'meta-value'}).text
        for item in static_items[:4]
    ]
    # The third value is replaced by the figure from the Ajax response. If that
    # field is absent, the value scraped from the page is kept instead of
    # raising IndexError as the original did.
    numbers_match = re.search(r'numbers":"(.*?)"', str(AjaxData))
    if numbers_match:
        meta_values[2] = numbers_match.group(1)

    # Middle section: the first three <li> sub-scores.
    evaluation = shortTitleDoc.find('div', {'class': 'evaluation-info'})
    sub_scores = [li.find('span').text for li in evaluation.find_all('li')[:3]]

    # Review section: only the first review on the page is recorded.
    review_list = shortTitleDoc.find('div', {'class': 'evaluation-list'})
    reviews = (
        review_list.find_all('div', {'class': 'evaluation evaluate'})
        if review_list is not None
        else []
    )
    if reviews:
        first_review = reviews[0]
        review_name = first_review.find('a', {'class': 'username'}).text
        review_score = first_review.find('div', {'class': 'star-box'}).find('span').text
        review_content = first_review.find('p', {'class': 'content'}).text
        review_praise = first_review.find('div', {'class': 'info clearfix'}).find('em').text
        review_time = first_review.find('span', {'class': 'time'}).text
    else:
        # A course without reviews gets blank review columns rather than
        # aborting the whole crawl.
        review_name = review_score = review_content = review_praise = review_time = ''

    description = doc5.find('div', {'class': 'course-description course-wrap'}).text

    csv_writer.writerow([
        course_title, teacher_name, teacher_job,
        meta_values[0], meta_values[1], meta_values[2], meta_values[3],
        sub_scores[0], sub_scores[1], sub_scores[2],
        review_name, review_score, review_content, review_praise, review_time,
        description,
    ])

# Crawl the course index, then scrape the first courses of every listing page
# returned by get_page1() and write one CSV row per course.
try:
    doc3 = get_text('https://www.imooc.com/course/list')
    longPage1 = get_page1(doc3)
    for listing_doc in longPage1:
        shortTitleDoc1 = get_page(listing_doc)
        shortTitleDoc3 = get_introduceDoc(listing_doc)
        AjaxData = get_AjaxSourceData(listing_doc)
        # The three lists come from the same links with the same filter, so
        # they line up element by element.
        for review_doc, intro_doc, members_doc in zip(shortTitleDoc1, shortTitleDoc3, AjaxData):
            get_page_text(review_doc, intro_doc, members_doc)
finally:
    # Close the CSV even if a request or a lookup fails part-way through, so
    # the rows already written are flushed to disk.
    f.close()

  结果保存在CSV表格当中(部分数据)

分类:

技术点:

相关文章: