1. 需要的三个包:
from wordcloud import WordCloud #词云库 import matplotlib.pyplot as plt #数学绘图库 import jieba;
2. 定义变量(将对应的变量放到一个全局的文件中):
# Shared settings for the Douban comment crawler: start URL, request headers,
# the "next page" link pattern and the cookies sent with every request.
import re

# First page of the short-comment listing for subject 26363254.
pdurl_first = 'https://movie.douban.com/subject/26363254/comments?start=0'

# Browser-like User-Agent sent with every request.
head = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/59.0.3071.109 Chrome/59.0.3071.109 Safari/537.36',
}

# Captures the href of the anchor carrying class="next", cut at the first '&',
# so only the leading part of the query string (e.g. "?start=20") is returned.
reg = re.compile(r'<a href="(.*?)&.*?class="next">')  # next page

# NOTE(review): these look like cookies copied from a logged-in browser session
# (e.g. `dbcl2`, `ck`); publishing them exposes credentials -- confirm, and
# consider loading them from a private, untracked source instead.
# NOTE(review): sibling keys use a double underscore (`__utma`, `__utmc`);
# `_utmb` may be a typo -- left unchanged here.
# The original literal listed "push_doumail_num" twice; the duplicate key was
# dropped (the resulting dict is identical).
cookies = {
    "__utma": "30149280.503249607.1504402391.1504402391.1504402391.1",
    "_utmb": "30149280.2.9.1504402391",
    "__utmc": "30149280",
    "__utmt": "1",
    "__utmz": "30149280.1504402391.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
    "ap": "1",
    "as": '"https://movie.douban.com/subject/26363254/comments?start=225&limit=20&sort=new_score&status=P"',
    "bid": "g7k4BGd2sRk",
    "ck": "76vs",
    "dbcl2": '"166279730:fohmXhoM9uU"',
    "ps": "y",
    "push_doumail_num": "0",
}
3. 抓取数据
# Crawler script: downloads the Douban short-comment pages one after another
# and appends the fields matched on each page to a CSV file.
import requests
import re
from GrabData import Param
import pandas as pd
from bs4 import BeautifulSoup


class GrabComent:
    """Crawl the comment pages and append the extracted rows to a CSV file.

    The whole crawl runs inside the constructor, so ``GrabComent()`` starts
    it. Paging follows the "next" link found by ``Param.reg`` and stops when
    a response is not HTTP 200 or when the page has no such link.
    """

    # Prefix joined with the query string captured by ``Param.reg``.
    COMMENTS_URL = 'https://movie.douban.com/subject/26363254/comments'

    # Output file; every page is appended to it.
    CSV_PATH = 'H:\\python_projects\\ticket\\zhanlangpinglun.csv'

    # One match per comment, six capture groups. The first group is the
    # content of the "votes" span; the remaining groups are presumably user
    # name, status, rating and date attributes and the comment text -- verify
    # against the live page markup. re.S lets '.' span line breaks.
    ren = re.compile(r'<span class="votes">(.*?)</span>.*?comment">.*?</span>.*?<span.*?class="">(.*?)</a>.*?<span>(.*?)</span>.*?title="(.*?)"></span>.*?title="(.*?)"><p .*? > (.*?)</p>', re.S)

    def __init__(self):
        print('开始抓取数据')
        html = requests.get(Param.pdurl_first, headers=Param.head, cookies=Param.cookies)
        while html.status_code == 200:
            # Save the current page first, so the last page is stored even
            # though it has no "next" link.
            zhanlang = re.findall(self.ren, html.text)
            print(zhanlang)
            # 'a+' is append mode: each page is added below the previous ones.
            pd.DataFrame(zhanlang).to_csv(self.CSV_PATH, header=False, index=False, mode='a+')

            # The original indexed [0] unconditionally and raised IndexError
            # on a page without a "next" link; stop cleanly instead.
            next_links = re.findall(Param.reg, html.text)
            if not next_links:
                break
            url_next = self.COMMENTS_URL + next_links[0]
            print("下一页地址:" + url_next)
            html = requests.get(url_next, cookies=Param.cookies, headers=Param.head)


if __name__ == '__main__':
    GrabComent()
4. 生成云图
# Word-cloud script: reads the crawled comments CSV, segments the text with
# jieba and renders the resulting words as a word cloud.
from wordcloud import WordCloud  # word-cloud library
import matplotlib.pyplot as plt  # plotting library
import jieba


class WordYun:
    """Build and display a word cloud from the saved comments file.

    Everything runs from the constructor: ``WordYun()`` reads the file,
    segments it and opens the plot window.
    """

    def __init__(self):
        print("开始读取文件!")
        self.main()

    def main(self):
        """Read and segment the comments, then show the word cloud."""
        text = self.readFile()
        self.showTitle(text)

    def showTitle(self, text1):
        """Render ``text1`` (space-separated words) as a word cloud and show it."""
        wc1 = WordCloud(
            background_color="white",
            width=1000,
            height=860,
            # Per the original author's note: without this font setting the
            # output shows box-shaped garbage instead of the characters.
            font_path="D:\\Windows\\Fonts\\STFANGSO.ttf",
            margin=2,
        )
        # generate() is given the text as a str.
        wc2 = wc1.generate(text1)
        plt.imshow(wc2)
        plt.axis("off")
        plt.show()

    def readFile(self):
        """Return the words longer than one character, joined by spaces.

        The file is read as UTF-8 (the encoding pandas ``to_csv`` uses when
        none is given) instead of the platform default, and it is closed as
        soon as it has been read.
        """
        with open(r'H:\\python_projects\\ticket\\zhanlangpinglun.csv', 'r', encoding='utf-8') as f:
            content = f.read()
        # Drop single-character tokens.
        a = [word for word in jieba.cut(content) if len(word) > 1]
        txt = ' '.join(a)
        print("readFile返回的结果:" + txt)
        return txt


if __name__ == '__main__':
    WordYun()