创建一个给句子打分的函数,然后应用到文本列:
import pandas as pd
# Example data: one sentence per row plus an unrelated numeric column.
text = (
    "very nice house, and great garden",
    "the book is very boring",
    "it was very interesting final end",
    "I have no idea which book do you prefer",
)
number = (3, 4, 5, 4)
df = pd.DataFrame({"text": text, "number": number})

# Sentiment lexicon: each known word and the score it contributes.
word = ("boring", "very", "interesting", "great", "book")
score = (-1.0, -1.0, 1.0, 1.0, 0.5)
df2 = pd.DataFrame({"word": word, "score": score})

# Turn the lexicon frame into a plain dict so that scoring a token is a
# single dictionary lookup instead of a DataFrame search.
word2score = df2.set_index("word")["score"].to_dict()
def score_text(sentence, lexicon=None):
    """Return the summed lexicon score of the words in ``sentence``.

    The sentence is split on whitespace, surrounding punctuation is stripped
    from each piece, and the scores of all pieces found in the lexicon are
    added together. Matching is exact, so it is case-sensitive.

    Args:
        sentence: Text to score. Non-string values (for example ``None`` or
            the NaN that marks a missing cell) score 0 instead of raising.
        lexicon: Optional mapping of word -> numeric score. When omitted,
            the module-level ``word2score`` dictionary is used.

    Returns:
        The sum of the scores of every matched word, or 0 when nothing
        matched.
    """
    if not isinstance(sentence, str):
        return 0
    if lexicon is None:
        lexicon = word2score
    # Naive tokenization: whitespace split, then trim common punctuation from
    # both ends. You probably want a more professional tokenizer here.
    tokens = (piece.strip(",.:;!?()'/") for piece in sentence.split())
    return sum(lexicon[token] for token in tokens if token in lexicon)
# Score every sentence, then attach the results as a new column.
sentence_scores = df["text"].apply(score_text)
df["score"] = sentence_scores
print(df)
# Printed result:
#                                       text  number  score
# 0        very nice house, and great garden       3    0.0
# 1                  the book is very boring       4   -1.5
# 2        it was very interesting final end       5    0.0
# 3  I have no idea which book do you prefer       4    0.5
编辑:
如果要分别统计正面词和负面词的个数,需要对打分函数稍作修改,让它返回 [正面词数, 负面词数]:
def score_text(sentence, lexicon=None):
    """Count the positive and negative lexicon words in ``sentence``.

    The sentence is split on whitespace and surrounding punctuation is
    stripped from each piece before it is looked up. Matching is exact, so
    it is case-sensitive. Every occurrence is counted, so a repeated word
    counts more than once.

    Args:
        sentence: Text to analyse. Non-string values (for example ``None``
            or the NaN that marks a missing cell) yield ``[0, 0]`` instead
            of raising.
        lexicon: Optional mapping of word -> numeric score. When omitted,
            the module-level ``word2score`` dictionary is used.

    Returns:
        A two-element list ``[positive_count, negative_count]``. Words whose
        score is exactly 0, and words missing from the lexicon, are counted
        in neither slot.
    """
    counts = [0, 0]
    if not isinstance(sentence, str):
        return counts
    if lexicon is None:
        lexicon = word2score
    for piece in sentence.split():
        # Naive tokenization: trim common punctuation from both ends. You
        # probably want a more professional tokenizer here.
        token = piece.strip(",.:;!?()'/")
        # Unknown words default to 0 and therefore fall through both tests.
        value = lexicon.get(token, 0)
        if value > 0:
            counts[0] += 1
        elif value < 0:
            counts[1] += 1
    return counts
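# After re-running df['score'] = df['text'].apply(score_text) with the new
# function, print(df) shows [positive count, negative count] per sentence:
#                                       text  number   score
# 0        very nice house, and great garden       3  [1, 1]
# 1                  the book is very boring       4  [1, 2]
# 2        it was very interesting final end       5  [1, 1]
# 3  I have no idea which book do you prefer       4  [1, 0]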