【问题标题】:Python NLTK not calculating sentiment correctly(Python NLTK 情感计算不正确)
【发布时间】:2013-11-06 12:02:32
【问题描述】:

我确实有一些正面和负面的句子。我想非常简单地使用 Python NLTK 来训练 NaiveBayesClassifier 来调查其他句子的情绪。

我尝试使用此代码,但我的结果始终是正面(positive)的。 http://www.sjwhitworth.com/sentiment-analysis-in-python-using-nltk/

我是 Python 的新手,所以可能在复制代码时出了错。

import nltk
import math
import re
import sys
import os
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')

from nltk.corpus import stopwords

# Absolute directory of this script: the script's directory joined onto the
# current working directory, then resolved with realpath.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

# Training data files are expected to sit next to the script.
postweet = os.path.join(__location__, "postweet.txt")
negtweet = os.path.join(__location__, "negtweet.txt")


# Extra words removed from the feature word list in addition to the
# NLTK English stopwords.
customstopwords = ['band', 'they', 'them']

# Load positive tweets into a list, one entry per line of the file.
# The `with` block closes the file as soon as it has been read, even if
# reading raises. `p` stays bound to the (closed) file object, so a later
# p.close() is a harmless no-op.
with open(postweet, 'r') as p:
    postxt = p.readlines()

# Load negative tweets into a list the same way.
with open(negtweet, 'r') as n:
    negtxt = n.readlines()

# One sentiment label per tweet line, so texts and labels pair up 1:1.
neglist = ['negative'] * len(negtxt)
poslist = ['positive'] * len(postxt)

# Pair every raw tweet line with its sentiment label.
postagged = zip(postxt, poslist)
negtagged = zip(negtxt, neglist)

# One combined labelled list: positives first, then negatives.
taggedtweets = postagged + negtagged

# Lower-case and whitespace-split each tweet, keeping its label, giving a
# list of (list_of_words, sentiment) tuples.
tweets = [
    ([token.lower() for token in text.split()], label)
    for (text, label) in taggedtweets
]

#Pull out all of the words in a list of tagged tweets, formatted in tuples.
def getwords(tweets):
    """Flatten the word lists of all tagged tweets into a single list.

    tweets is an iterable of (words, sentiment) pairs. The sentiment is
    ignored; the words of every pair are concatenated in iteration order.
    """
    return [token for (tokens, _label) in tweets for token in tokens]

#Order a list of tweets by their frequency.
def getwordfeatures(listoftweets):
    """Return the keys of an nltk.FreqDist built from listoftweets.

    listoftweets is passed straight to nltk.FreqDist, so it is expected to
    be a flat sequence of words; the result is the distribution's keys().

    NOTE(review): whether keys() comes back ordered by frequency depends on
    the installed NLTK version -- confirm before relying on the order.
    """
    # Inspect the FreqDist itself if you want the individual word counts.
    return nltk.FreqDist(listoftweets).keys()

#Calls above functions - gives us list of the words in the tweets, ordered by freq.
# Debug output: show the words extracted from all tagged tweets.
print getwordfeatures(getwords(tweets))

# NOTE: wordlist starts out as an empty list here, so the two stopword
# filters below run over nothing and wordlist is still [] afterwards.
# feature_extractor() iterates over wordlist, so it then produces an empty
# feature dict for every document.
wordlist = [] 
# Drop NLTK's English stopwords.
wordlist = [i for i in wordlist if not i in stopwords.words('english')]
# Drop the project-specific stopwords as well.
wordlist = [i for i in wordlist if not i in customstopwords]

def feature_extractor(doc):
    """Map a tokenised document to boolean 'contains(word)' features.

    One feature is produced per entry of the module-level wordlist; its
    value is True when that word occurs in doc and False otherwise.
    """
    # A set makes each membership test independent of the document length.
    present = set(doc)
    return {'contains(%s)' % term: (term in present) for term in wordlist}

#Creates a training set - classifier learns distribution of true/falses in the input.
training_set = nltk.classify.apply_features(feature_extractor, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

print classifier.show_most_informative_features(n=30)

while True:
    input = raw_input('ads')
    if input == 'exit':
        break
    elif input == 'informfeatures':
        print classifier.show_most_informative_features(n=30)
        continue
    else:
        input = input.lower()
        input = input.split()
        print '\nWe think that the sentiment was ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n'

p.close()
n.close()

这只是代码写错了吗?还是有别的问题?程序启动时,应该由 print classifier.show_most_informative_features(n=30) 打印出最有信息量的特征,但我得到的输出只有 "Most Informative Features" 这一行标题,后面什么都没有。

不知道这一点能否提供一些线索。

谢谢

【问题讨论】:

    标签: python nltk bayesian sentiment-analysis


    【解决方案1】:

    致所有对使用 NLTK 进行情感分析感兴趣的人。这是完整的工作代码。感谢@NLPer

    import nltk
    import math
    import re
    import sys
    import os
    import codecs
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    from nltk.corpus import stopwords
    
    # Absolute directory of this script: the script's directory joined onto
    # the current working directory, then resolved with realpath.
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

    # Training data files are expected to sit next to the script.
    postweet = os.path.join(__location__, "postweet.txt")
    negtweet = os.path.join(__location__, "negtweet.txt")


    # Extra words removed from the feature word list in addition to the
    # NLTK English stopwords.
    customstopwords = ['band', 'they', 'them']

    # Load positive tweets into a list, one entry per line of the file.
    # The `with` block closes the file as soon as it has been read, even if
    # reading raises. `p` stays bound to the (closed) file object, so a
    # later p.close() is a harmless no-op.
    with open(postweet, 'r') as p:
        postxt = p.readlines()

    # Load negative tweets into a list the same way.
    with open(negtweet, 'r') as n:
        negtxt = n.readlines()
    
    # One sentiment label per tweet line, so texts and labels pair up 1:1.
    neglist = ['negative'] * len(negtxt)
    poslist = ['positive'] * len(postxt)

    # Pair every raw tweet line with its sentiment label.
    postagged = zip(postxt, poslist)
    negtagged = zip(negtxt, neglist)

    # One combined labelled list: positives first, then negatives.
    taggedtweets = postagged + negtagged

    # Lower-case and whitespace-split each tweet, keeping its label, giving
    # a list of (list_of_words, sentiment) tuples.
    tweets = [
        ([token.lower() for token in text.split()], label)
        for (text, label) in taggedtweets
    ]
    
    #Pull out all of the words in a list of tagged tweets, formatted in tuples.
    def getwords(tweets):
        """Flatten the word lists of all tagged tweets into a single list.

        tweets is an iterable of (words, sentiment) pairs. The sentiment is
        ignored; the words of every pair are concatenated in iteration order.
        """
        return [token for (tokens, _label) in tweets for token in tokens]
    
    #Order a list of tweets by their frequency.
    def getwordfeatures(listoftweets):
        """Return the keys of an nltk.FreqDist built from listoftweets.

        listoftweets is passed straight to nltk.FreqDist, so it is expected
        to be a flat sequence of words; the result is the distribution's
        keys().

        NOTE(review): whether keys() comes back ordered by frequency depends
        on the installed NLTK version -- confirm before relying on the order.
        """
        # Inspect the FreqDist itself if you want the individual word counts.
        return nltk.FreqDist(listoftweets).keys()
    
    #Calls above functions - gives us list of the words in the tweets, ordered by freq.
    print getwordfeatures(getwords(tweets))
    
    wordlist = getwordfeatures(getwords(tweets))
    wordlist = [i for i in wordlist if not i in stopwords.words('english')]
    wordlist = [i for i in wordlist if not i in customstopwords]
    
    def feature_extractor(doc):
        """Map a tokenised document to boolean 'contains(word)' features.

        One feature is produced per entry of the module-level wordlist; its
        value is True when that word occurs in doc and False otherwise.
        """
        # A set makes each membership test independent of the document length.
        present = set(doc)
        return {'contains(%s)' % term: (term in present) for term in wordlist}
    
    #Creates a training set - classifier learns distribution of true/falses in the input.
    training_set = nltk.classify.apply_features(feature_extractor, tweets)
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    
    print classifier.show_most_informative_features(n=30)
    
    while True:
        input = raw_input('ads')
        if input == 'exit':
            break
        elif input == 'informfeatures':
            print classifier.show_most_informative_features(n=30)
            continue
        else:
            input = input.lower()
            input = input.split()
            print '\nWe think that the sentiment was ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n'
    
    p.close()
    n.close()
    

    【讨论】:

      【解决方案2】:

      wordlist 为空。应该把 getwordfeatures(getwords(tweets)) 的结果赋值给它。

      以下两行:

      wordlist = [i for i in wordlist if not i in stopwords.words('english')]

      wordlist = [i for i in wordlist if not i in customstopwords]

      是“非此即彼”;您可以尝试哪个停用词列表效果更好。

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 2019-04-11
        • 2015-04-27
        • 1970-01-01
        • 1970-01-01
        • 2014-10-17
        • 1970-01-01
        • 1970-01-01
        相关资源
        最近更新 更多