词义消歧是NLP中的核心问题之一:词义、句义以及篇章含义层次都会根据不同的上下文环境产生不同的意义,消歧就是指根据上下文确定对象语义的过程。词义消歧是在词语层次上的语义消歧,在搜索引擎、意见挖掘、文本理解与生成、推理等任务中有广泛应用。
一、常用的算法
1、监督学习算法
a.确定词表和释义表,如目标词“bass”,有两个释义:乐器-贝斯,鱼类-鲈鱼;
b.获取语料:Google、百度
c.特征提取:一般先设定一个窗口,只关心这个窗口的词。
d.分类器选择:朴素贝叶斯、逻辑回归、SVM、KNN、神经网络
例如:基于贝叶斯分类
任何的多义词含义都跟上下文语境相关。假设语境(context)为c,语义为s则
P(s|c)=P(c|s)*P(s)/P(c),可以根据大量的语料统计,从而计算得到结果。
2、半监督学习算法
当目标词没有足够的语料的时候,从少量手动标注启动,按照同一共现释义中,不同词出现频率进行扩展,如bass的鲈鱼解释一般与fish共现;乐器贝斯解释一般与play共现。因此可以标注所有<fish, bass> 和 <play, bass>的语句。
3、无监督学习算法
一种贝叶斯分类器,参数估计不是基于有标注的训练语料,而是先随机初始化参数p(v|s),根据EM算法重新估计概率值,对每个词义上下文计算得到p(c|s),不断迭代从而得到最终分类的模型,最终利用余弦相似性计算得到结果。
4、其他方法
如基于语义角色标注、依存句法分析,可以对某些问题得到一个比较好的结果。
二、本文算法描述
主要通过百度百科,将待输入的消歧词进行查询,从而得到相关义项,并将消歧句子与各个义项所表示的句子进行相似性计算,从而得到与之相关的该消歧词的意思,有点远程监督的味道。
具体代码如下:
import os
from urllib import request
from lxml import etree
from urllib import parse
import jieba.posseg as pseg
import jieba.analyse as anse
import numpy as np
# Dimensionality of the word2vec vectors in the embedding file.
embedding_size = 300
# Path to the pre-trained 300-d word vectors.  Raw string fix: the
# original "D:\workspace\project\\NLPcase..." contained the invalid
# escape sequences "\w" and "\p" (DeprecationWarning in modern Python);
# the raw literal below has the exact same value.
embedding_path = r"D:\workspace\project\NLPcase\mutilSenmanticWord\data\wrod_vec_300.bin"
# Cosine-similarity threshold above which two sense words are merged
# into one cluster (see concept_cluster).
sim_limit = 0.8
def get_html(url):
    '''Download the page at *url* and return its UTF-8 text with spaces removed.'''
    page_bytes = request.urlopen(url).read()
    page_text = page_bytes.decode('utf-8')
    return page_text.replace(' ', '')
def collect_mutilsens(word):
    '''Query Baidu Baike for *word* and return {sense text: sense page URL}.'''
    # parse.quote percent-encodes special characters so the URL is legal.
    url = "http://baike.baidu.com/item/%s?force=1" % parse.quote(word)
    selector = etree.HTML(get_html(url))
    sense_texts = selector.xpath('//li[@class="list-dot list-dot-paddingleft"]/div/a/text()')
    senses = [''.join(text.split(':')) for text in sense_texts]
    hrefs = selector.xpath('//li[@class="list-dot list-dot-paddingleft"]/div/a/@href')
    links = ['http://baike.baidu.com' + href for href in hrefs]
    return {senses[idx]: links[idx] for idx in range(len(senses))}
def extract_concept(desc):
    '''Extract a concept word from a sense description.

    Segments *desc* with jieba POS tagging, keeps tokens tagged
    n/b/v/d, and returns the last such candidate.

    Bug fix: the original returned ``concepts_candi[-1]`` unconditionally
    and raised IndexError when no token carried one of those tags; an
    empty string is returned in that case instead.
    '''
    tagged = [(token.word, token.flag) for token in pseg.cut(desc)]
    candidates = [word for word, flag in tagged if flag in ('n', 'b', 'v', 'd')]
    return candidates[-1] if candidates else ''
def entity_clusters(s):
    '''Group the entity lists in *s* into connected components.

    Two lists belong to the same cluster when they share at least one
    element, transitively.  Returns a list of sets, one per component,
    ordered by first appearance in *s*.

    Bug fixes vs. the original:
    - ``cluster = s[i]`` aliased the input list and then extended it in
      place, silently mutating the caller's data;
    - the single pairwise pass missed transitive links that only become
      visible after earlier merges, and appended duplicate partial
      clusters for already-merged components.
    '''
    clusters = []
    visited = [False] * len(s)
    for start in range(len(s)):
        if visited[start]:
            continue
        # Grow the component until no remaining list shares an element.
        component = set(s[start])
        visited[start] = True
        changed = True
        while changed:
            changed = False
            for j in range(len(s)):
                if not visited[j] and component.intersection(s[j]):
                    component.update(s[j])
                    visited[j] = True
                    changed = True
        clusters.append(component)
    return clusters
def similarity_cosine(vector1, vector2):
    '''Return the cosine similarity of two 1-D numpy vectors.

    Returns 0.0 when either vector has zero norm.  The original relied
    on 0/0 producing NaN (emitting a RuntimeWarning) and then tested
    ``str(similarity) == 'nan'``; the explicit norm guard below yields
    the same result without the warning or the fragile string check.
    '''
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return float(np.dot(vector1, vector2) / (norm1 * norm2))
def get_wordvector(word):
    '''Look up *word* in the global embedding table; zero vector when absent.'''
    vector = embdding_dict.get(word)
    if vector is None:
        vector = [0] * embedding_size
    return np.array(vector)
def load_embedding(embedding_path):
    '''Load a whitespace-separated word2vec text file.

    Each valid line is "<word> <300 floats>"; shorter lines (such as the
    "<count> <dim>" header emitted by word2vec) are skipped.  Returns a
    dict mapping word -> np.ndarray of floats.

    Fixes vs. the original: the file handle is now closed via ``with``,
    and the encoding is pinned instead of using the platform default.
    NOTE(review): assumes the vector file is UTF-8 encoded — confirm
    against the producer of wrod_vec_300.bin.
    '''
    embedding_dict = {}
    count = 0
    with open(embedding_path, encoding='utf-8') as fin:
        for line in fin:
            tokens = line.strip().split(' ')
            if len(tokens) < 300:
                # header line or truncated entry — skip it
                continue
            word = tokens[0]
            embedding_dict[word] = np.array([float(x) for x in tokens[1:]])
            count += 1
            if count % 10000 == 0:
                # progress indicator for large vocabulary files
                print(count, 'loaded')
    print('loaded %s word embedding, finished' % count)
    return embedding_dict
# Load the embeddings once at import time (slow: reads the whole vector
# file from disk).  NOTE(review): the global name "embdding_dict" is
# misspelled, but the rest of the module consistently uses this
# spelling, so it must be kept as-is.
embdding_dict = load_embedding(embedding_path)
def concept_cluster(concept_dict):
    '''Cluster the sense words of *concept_dict* by embedding similarity.

    Sense words whose vectors reach cosine similarity >= sim_limit are
    linked, linked groups are merged via entity_clusters, and one
    representative sense per cluster is kept with its original link list.
    '''
    linked_groups = []
    for base_sense in concept_dict:
        group = [base_sense]
        for other_sense in concept_dict:
            if other_sense == base_sense:
                continue
            score = similarity_cosine(get_wordvector(base_sense),
                                      get_wordvector(other_sense))
            if score >= sim_limit:
                group.append(other_sense)
        linked_groups.append(group)
    cluster_sens_dict = {}
    for cluster in entity_clusters(linked_groups):
        representative = list(cluster)[0]
        cluster_sens_dict[representative] = concept_dict[representative]
    return cluster_sens_dict
def extract_desc(link):
    '''Fetch a sense page and return its (description, keywords) meta content.

    Both values are lists of strings as returned by the XPath queries on
    the page's <meta> tags.
    '''
    selector = etree.HTML(get_html(link))
    keywords = selector.xpath('//meta[@name="keywords"]/@content')
    desc = selector.xpath('//meta[@name="description"]/@content')
    return desc, keywords
def collect_concepts(wd):
    '''Main entry for a polysemous word.

    Collects candidate senses of *wd* from Baidu Baike, groups their
    links by extracted concept word, clusters near-duplicate concepts,
    and returns {concept: description text} built from one representative
    page per cluster.  Returns {} when no senses are found.
    '''
    sens_dict = collect_mutilsens(wd)
    if not sens_dict:
        return {}
    # Group sense links by their extracted concept word.
    concept_dict = {}
    for sen, link in sens_dict.items():
        concept = extract_concept(sen)
        concept_dict.setdefault(concept, []).append(link)
    # Merge concepts whose words are near-synonyms in embedding space.
    cluster_concept_dict = concept_cluster(concept_dict)
    concepts_dict = {}
    for concept, links in cluster_concept_dict.items():
        # Use the first page of each cluster as the sense description.
        desc, keywords = extract_desc(links[0])
        # Bug fix: the original built ``''.join(desc + [' ' + keywords])``,
        # which raises TypeError because ``keywords`` is a list, not a str.
        context = ''.join(desc) + ' ' + ''.join(keywords)
        concepts_dict[concept] = context
    return concepts_dict
#------------------------------------------句子的语义级别表示----------------------
# 对义项的描述信息进行关键词提取,作为该义项的一个结构化表示
def extract_keywords(sent):
    '''Return up to 20 jieba-extracted keywords of *sent*, without weights.'''
    return list(anse.extract_tags(sent, topK=20, withWeight=False))
# 基于word2vector,通过lookup table的方式找到句子的wordvector的表示
def rep_sentVector(sent):
    '''Represent *sent* as the mean embedding of its in-vocabulary keywords.

    Keywords are extracted with extract_keywords; those present in the
    global embedding table are summed and the sum is averaged.

    Bug fix: when no keyword was in the vocabulary the original divided
    by zero, producing a NaN vector plus a RuntimeWarning; the all-zero
    vector is returned in that case instead.
    '''
    embedding = np.zeros(embedding_size)
    hit_count = 0
    for wd in extract_keywords(sent):
        if wd in embdding_dict:
            embedding += embdding_dict.get(wd)  # sum keyword vectors
            hit_count += 1
    if hit_count == 0:
        # nothing in vocabulary: zero vector (cosine vs. it scores 0)
        return embedding
    return embedding / hit_count
# 基于词语相似度,计算句子的相似度
def distance_words(sent1, sent2):
    '''Sentence similarity via best-match keyword alignment.

    For each keyword of one sentence, take its best cosine match among
    the other sentence's keywords; average per direction and return the
    larger of the two direction scores.

    Bug fix: when either sentence produced no keywords the original
    raised ValueError (``max()`` on an empty sequence) or
    ZeroDivisionError; 0.0 is returned in that case instead.
    '''
    wds1 = extract_keywords(sent1)
    wds2 = extract_keywords(sent2)
    if not wds1 or not wds2:
        return 0.0
    score_wds1 = [max(similarity_cosine(get_wordvector(w1), get_wordvector(w2))
                      for w2 in wds2) for w1 in wds1]
    score_wds2 = [max(similarity_cosine(get_wordvector(w2), get_wordvector(w1))
                      for w1 in wds1) for w2 in wds2]
    return max(sum(score_wds1) / len(wds1), sum(score_wds2) / len(wds2))
#-----------------------对该词进行消歧---------------
def detect_main(sent, word):
    '''Disambiguate *word* within *sent*.

    Scores every Baike-derived concept of *word* against the sentence by
    (a) sentence-vector cosine similarity and (b) keyword-alignment
    similarity, and returns both rankings (best first) as lists of
    (concept, score) pairs: (keyword-based ranking, vector-based ranking).
    '''
    sent = sent.replace(word, '')
    concept_dict = collect_concepts(word)   # candidate senses and their descriptions
    sent_vector = rep_sentVector(sent)      # embedding of the query sentence
    concept_scores_sent = {}
    concept_scores_wds = {}
    for concept, desc in concept_dict.items():
        concept_vector = rep_sentVector(desc)
        concept_scores_sent[concept] = similarity_cosine(sent_vector, concept_vector)
        # Alternative, word-level similarity measure.
        concept_scores_wds[concept] = distance_words(desc, sent)
    ranked_sent = sorted(concept_scores_sent.items(), key=lambda pair: pair[1], reverse=True)
    ranked_wds = sorted(concept_scores_wds.items(), key=lambda pair: pair[1], reverse=True)
    return ranked_wds, ranked_sent
三、参考资料
https://blog.csdn.net/liguochao1001/article/details/86596183
http://www.xjishu.com/zhuanli/55/201810179896.html
https://blog.csdn.net/weixin_38776853/article/details/79522149
https://blog.csdn.net/Uwr44UOuQcNsUQb60zk2/article/details/81074410
https://github.com/liuhuanyong/WordMultiSenseDisambiguation/blob/master/wordsense_detect.py