
For my thesis I am preparing to do sentiment analysis on mobile-phone reviews. With existing tools it is easy to deduplicate Chinese reviews, segment them, remove stop words, vectorize them, and then run one of the classifiers in sklearn to label the sentiment (a rough sketch of such a pipeline is given right after the outline below). However, I want to analyze sentiment separately for each phone attribute, e.g. reviews about the 'battery', 'camera', or 'processor'. Unfortunately, reviews already grouped by attribute cannot be crawled from the web; only general reviews are available, i.e. reviews that are not organized by phone feature. Tmall and JD.com both offer only such general reviews. So the review texts first need a multi-class classification step, namely classification by attribute. I divide a phone into seven important features: 'camera', 'processor', 'price', 'after-sales service', 'battery life', 'appearance', and 'performance'. Since this requires multi-class classification of the reviews, I use a classification approach based on attribute dictionaries; there is no publicly available dictionary of phone attributes, so I have to build the attribute dictionaries myself. The work is split into three parts:

1. Dictionary construction

2. Classifying the texts with the attribute dictionaries

3. Sentiment analysis of the review texts
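
For reference, the deduplicate, segment, remove-stop-words, vectorize, classify pipeline mentioned above could look roughly like the minimal sketch below. It assumes jieba for segmentation and a TF-IDF plus LogisticRegression combination from sklearn; the sample reviews, labels, and stop-word list are only placeholders, not the data or the exact setup used later in this series.

'''Sketch: a generic sentiment-classification pipeline (placeholder data, assumed tools)'''
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# placeholder labeled reviews: 1 = positive, 0 = negative
reviews = ['电池很耐用 一天一充也够用', '相机太差了 夜景全是噪点',
           '外观漂亮 手感也不错', '发热严重 用一会就卡死']
labels = [1, 0, 1, 0]
stopwords = {'的', '了', '是', '也'}  # placeholder stop-word list

def segment(text):
    # segment the Chinese text and drop stop words
    return ' '.join(w for w in jieba.lcut(text) if w.strip() and w not in stopwords)

docs = [segment(r) for r in reviews]  # (duplicate reviews would be dropped before this step)

# vectorize with TF-IDF and train an sklearn classifier
clf = make_pipeline(TfidfVectorizer(), LogisticRegression())
clf.fit(docs, labels)
print(clf.predict([segment('这个价格买到这种拍照效果很值')]))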

This article covers the first part, building the attribute dictionaries. The code is given directly below.

'''Dictionary construction'''
import gensim  # only gensim is needed for the dictionary-building script below

class BulidDict():
    def __init__(self):
        pass

    '''Collect the words whose similarity to a seed word is greater than the threshold p'''
    def similarity_word(self, words, p=0.8):
        # word2vec model trained on the segmented phone-review corpus
        model = gensim.models.Word2Vec.load('D:/machinelearning data/word2vec/phone_comment_vec_mini_count_5')
        word_list = []
        for w in words:
            # take the 2000 words most similar to the seed word w
            # (with gensim >= 4.0 this call becomes model.wv.most_similar)
            listA = model.most_similar(w, topn=2000)
            print(w, listA)
            for i, j in listA:
                if j > p:  # keep only candidates whose similarity exceeds the threshold
                    print(i)
                    word_list.append(i)
        return set(word_list)

    '''Save the words to a file, one word per line'''
    def save(self, words, save_path):
        with open(save_path, 'w', encoding='utf-8') as f:
            for i in words:
                f.write(i + '\n')

    '''Read a word list, one word per line'''
    def openFile(self, path):
        with open(path, 'r', encoding='utf-8') as f:
            for word in f.readlines():
                yield word.strip()

    '''Build one attribute dictionary: expand the seed words and save the result'''
    def build_dict(self, loadPath, savePath, p):
        wordSet = list(self.openFile(loadPath))
        simWord = self.similarity_word(wordSet, p)
        self.save(simWord, savePath)

    '''Remove duplicate words from a dictionary file and save it'''
    def del_repetition(self, file_path, save_path):
        file = self.openFile(file_path)
        self.save(set(file), save_path)

if __name__ == '__main__':
    # seed-word files for the seven phone attributes
    path1 = 'D:/machinelearning data/buildDict/camera.txt'
    path2 = 'D:/machinelearning data/buildDict/processor.txt'
    path3 = 'D:/machinelearning data/buildDict/pricemin.txt'
    path4 = 'D:/machinelearning data/buildDict/performance.txt'
    path5 = 'D:/machinelearning data/buildDict/serve.txt'
    path6 = 'D:/machinelearning data/buildDict/appearance.txt'
    path7 = 'D:/machinelearning data/buildDict/endurance.txt'
    demo = BulidDict()
    # set1 = list(demo.openFile(path1))
    # list1 = demo.similarity_word(set1, p=0.92)
    # demo.save(list1, 'D:\\论文文件\\阅读论文\\写论文准备\\字典构建\\手机属性词典\\相机0.9.txt')
    savePath1 = 'D:\\论文文件\\阅读论文\\写论文准备\\字典构建\\手机属性词典\\相机0.85.txt'
    savePath2 = 'D:\\论文文件\\阅读论文\\写论文准备\\字典构建\\手机属性词典\\处理器0.89.txt'
    savePath3 = 'D:\\论文文件\\阅读论文\\写论文准备\\字典构建\\手机属性词典\\价格.txt'
    savePath4 = 'D:\\论文文件\\阅读论文\\写论文准备\\字典构建\\手机属性词典\\性能0.93.txt'
    savePath5 = 'D:\\论文文件\\阅读论文\\写论文准备\\字典构建\\手机属性词典\\售后0.85.txt'
    savePath6 = 'D:\\论文文件\\阅读论文\\写论文准备\\字典构建\\手机属性词典\\外观0.9.txt'
    savePath7 = 'D:\\论文文件\\阅读论文\\写论文准备\\字典构建\\手机属性词典\\续航电池0.7.txt'
    # expand each attribute's seed words, each with its own similarity threshold
    # demo.build_dict(path2, savePath2, 0.89)  # 0.89 works best for 'processor'
    # demo.build_dict(path3, savePath3, 0.6)
    # demo.build_dict(path4, savePath4, 0.93)  # 0.89 + 0.9 + 0.93 combined works best for 'performance'
    # demo.build_dict(path5, savePath5, 0.85)  # 0.85 works best for 'after-sales service'
    # demo.build_dict(path6, savePath6, 0.9)   # 0.9 works best for 'appearance'
    # demo.build_dict(path1, savePath1, 0.85)  # 0.85 plus the base seed dictionary is already best for 'camera'
    # demo.build_dict(path7, savePath7, 0.7)   # 0.7 for the 'battery' words and 0.85 for the rest works best

    '''Deduplicate the dictionaries and store them under unified names'''
    abs_path = 'D:\\论文文件\\阅读论文\\写论文准备\\字典构建\\手机属性词典\\dictionary_0_2\\'
    save_name = ['相机.txt', '处理器.txt', '价格.txt', '性能.txt', '售后.txt', '外观.txt', '续航.txt']
    open_name = ['相机0.85.txt', '处理器0.89(完美).txt', '价格0.9+0.95+0.6(完美).txt', '性能0.89+0.9+0.93(完美).txt', '售后0.85(最优).txt', '外观0.9(最优).txt', '续航0.85+电池0.7(完美).txt']

    '''Remove duplicate words from each dictionary file'''
    for i in range(len(save_name)):  # reading these txt files requires openFile() to use 'utf-8' encoding
        demo.del_repetition(abs_path + open_name[i], abs_path + save_name[i])
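
One note on the code above: similarity_word loads a word2vec model that was already trained on the segmented review corpus; that training step is not shown in this article. A minimal sketch of how such a model could be trained with gensim is given below. The directory of segmented files, vector size, and window are assumptions on my part; only min_count=5 is hinted at by the saved model's file name.

'''Sketch: training the word2vec model assumed by similarity_word (parameters are guesses)'''
import gensim
from gensim.models.word2vec import PathLineSentences

# assumed directory of pre-segmented reviews, one space-separated review per line
sentences = PathLineSentences('D:/machinelearning data/word2vec/segmented_comments/')

# hypothetical training parameters; with gensim >= 4.0 'size' is called 'vector_size'
model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=5, workers=4)
model.save('D:/machinelearning data/word2vec/phone_comment_vec_mini_count_5')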

  

 
