Python实现基于词典的文本情感分析

情感分析目前比较流行的方法有两种,一是词库法、二是机器学习法。机器学习法是在已知分类语料的情况下,构建文档–词条矩阵,然后应用各种分类算法(kNN、NB、RF、SVM、DL等),预测出句子的情感。

通过词库的方式定性每一句话的情感没有什么高深的理论基础,其思想就是对每一句话进行分词,然后对比正面词库与负面词库,从而计算出句子的正面得分(词中有多少是正面的)与负面得分(词中有多少是负面的),以及综合得分(正面得分-负面得分)。虽然该方法通俗易懂,但是非常耗人力成本,如正负面词库的构建、自定义词典的导入等。

以某汽车的空间评论数据作为分析对象,来给每条评论打上正面或负面的标签:

# -*- coding: utf-8 -*-
"""
Created on Sun May 14 16:04:08 2017
wordcloud required C++ 14.0
running on python 3.5
@author: wangmin
"""
import jieba
import collections
import numpy as np
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
# Load the review texts, the positive/negative sentiment lexicons and the
# stopword list. Every CSV is assumed to carry the value of interest in its
# second comma-separated column — TODO confirm against the actual files.

def _read_second_column(path):
    """Return the second comma-separated field of every line in *path*.

    Uses ``with`` so the file handle is closed even on error (the original
    opened four files and never closed any of them).
    """
    with open(path, 'r') as f:
        return [line.rstrip().split(',')[1] for line in f]

evaluation = _read_second_column("evaluation.csv")
del evaluation[0]  # drop the CSV header row
neg = _read_second_column("negative.csv")
pos = _read_second_column("positive.csv")
mydict = pos + neg  # combined sentiment dictionary
stopwords = _read_second_column("stopwords.csv")
# Segment every review with jieba and keep the token lists.
eva = [list(jieba.cut(text, cut_all=False)) for text in evaluation]

# Drop single-character tokens, which carry little sentiment signal.
# The original hard-coded the corpus size (3321) and wrote the filtered
# lists back through an alias (`new_eva = eva`); use len-independent
# comprehensions and build a genuinely new list instead.
new_eva = [[tok for tok in tokens if len(tok) >= 2] for tokens in eva]

# TODO: stopword filtering was never implemented — the original left only
# a commented-out stub (`for word in stopwords:`); `new_eva` still
# contains stopwords.
# Sentiment scoring function based on the positive/negative lexicons.
def GetScore(words, neg_words=None, pos_words=None):
    """Classify one tokenised review against the sentiment lexicons.

    Parameters
    ----------
    words : iterable of str
        The tokens of a single review. (Renamed from ``list``, which
        shadowed the builtin.)
    neg_words, pos_words : collection of str, optional
        Negative / positive lexicons. Default to the module-level
        ``neg`` / ``pos`` lists, so existing callers are unchanged.

    Returns
    -------
    str
        ``'NEGATIVE'`` if more tokens match the negative lexicon,
        ``'POSITIVE'`` if more match the positive one, ``'NEUTRAL'`` on a
        tie (including no matches at all, e.g. an empty review).
    """
    if neg_words is None:
        neg_words = neg
    if pos_words is None:
        pos_words = pos
    # Sets give O(1) membership tests instead of O(n) list scans per token.
    neg_set = set(neg_words)
    pos_set = set(pos_words)
    neg_s = 0
    pos_s = 0
    for w in words:
        # A token present in both lexicons counts as negative only,
        # mirroring the original if/elif order.
        if w in neg_set:
            neg_s += 1
        elif w in pos_set:
            pos_s += 1
    if neg_s > pos_s:
        return 'NEGATIVE'
    if pos_s > neg_s:
        return 'POSITIVE'
    return 'NEUTRAL'
# Label every segmented review with its sentiment class.
Score = [GetScore(tokens) for tokens in new_eva]
# Disabled diagnostic: count how many reviews fall into each class.
# def find_all_index(arr, item):
#     return [i for i, a in enumerate(arr) if a == item]
# print(len(find_all_index(Score, 'NEGATIVE')))
# print(len(find_all_index(Score, 'POSITIVE')))
# print(len(find_all_index(Score, 'NEUTRAL')))
# Word-frequency table over all filtered tokens.
# collections.Counter replaces the hand-rolled "if word not in wf" loop;
# it is a dict subclass, so downstream dict-consuming code is unaffected.
wf = collections.Counter(word for tokens in new_eva for word in tokens)
def Sort_by_count(d):
    """Return the entries of *d* as an OrderedDict, highest count first.

    Sorting is stable, so entries with equal counts keep their input order.
    """
    ranked = sorted(d.items(), key=lambda kv: kv[1], reverse=True)
    return collections.OrderedDict(ranked)
# Rank the vocabulary by frequency and keep the top 50 (word, count) pairs.
wf = Sort_by_count(wf)
top_key = list(wf.items())  # (word, count) pairs, most frequent first
# The original sliced [1:51] (silently dropping the single most frequent
# word) and printed [0:49] (49 entries, not 50). Use one consistent
# top-50 window.
top_word = top_key[0:50]
print(top_word)
# Build the word-cloud corpus and render it.
# str.join is linear; the original `+=` loop over a hard-coded 3322 reviews
# was quadratic, seeded the corpus with a stray 'a' token, and then stripped
# the literal substring 'word' from it — all three hacks are dropped.
word_space_split = " ".join(" ".join(tokens) for tokens in new_eva)
# mask = np.array(Image.open('C:/Users/wangmin/Pictures/aaa/abc.png'))
wc = WordCloud(
    # Raw string: 'C:\Windows\...' contains invalid escape sequences otherwise.
    font_path=r'C:\Windows\Fonts\STSONG.TTF',
    background_color="black",
    scale=5,
    margin=1,
    stopwords=stopwords,   # words excluded from the cloud
    max_words=50,          # at most 50 words displayed
    max_font_size=150,
    random_state=30,       # fixed seed -> reproducible layout/colouring
)
wc.generate(word_space_split)
# image_colors = ImageColorGenerator(abel_mask)
plt.imshow(wc)
plt.axis("off")
plt.show()

通过文字云绘制结果可以判断,消费者还是非常认可该款汽车的空间大小,普遍表示满意。
img

数据及源码地址:链接: https://pan.baidu.com/s/1o8QBDOI 密码: 2xw8

如果文章对您有用请随意打赏,谢谢支持!