答案是按照 CoreNLP 的 Additional TokensRegexNER Rules(附加 TokensRegexNER 规则)格式制作一个规则文件。
我先用正则表达式提取出文本中带标签的名称,再据此生成一个临时规则文件,并通过 `-ner.additional.regexner.mapping mytemprulesfile` 选项把它传给 CoreNLP jar。
Alexander III of Macedon PERSON PERSON,LOCATION,ORGANIZATION,MISC
Aristotle PERSON PERSON,LOCATION,ORGANIZATION,MISC
Anatolia LOCATION PERSON,LOCATION,ORGANIZATION,MISC
Alexander PERSON PERSON,LOCATION,ORGANIZATION,MISC
Persia LOCATION PERSON,LOCATION,ORGANIZATION,MISC
Issus LOCATION PERSON,LOCATION,ORGANIZATION,MISC
Gaugamela LOCATION PERSON,LOCATION,ORGANIZATION,MISC
Persian King Darius III PERSON PERSON,LOCATION,ORGANIZATION,MISC
Achaemenid Empire ORGANIZATION PERSON,LOCATION,ORGANIZATION,MISC
为了便于阅读,我对上面的列表做了对齐;但在实际的规则文件中,各列是用制表符(Tab)分隔的。
一个有趣的发现是:一些预先标注的多词实体会按原先标注的那样作为一个整体保留下来;而在不使用规则文件的情况下运行 CoreNLP,有时会把这些词拆分成几个独立的实体。
我原本想专门标识出各个命名实体标记(token),认为这样会让共指消解(coreference)更容易,但我想目前这样就够用了。毕竟,在同一篇文档中,实体名称相同却互不相关的情况又能有多常见呢?
示例 (执行大约需要 70 秒)
import os, re, tempfile, json, nltk, pprint
from subprocess import PIPE
from nltk.internals import (
find_jar_iter,
config_java,
java,
_java_options,
find_jars_within_path,
)
def ExtractLabeledEntitiesByRegex( text, regex ):
    """Pull inline-labelled entities out of *text* and strip the labels.

    Args:
        text: Text containing inline annotations such as
            ``[PERSON: Aristotle]``.
        regex: Pattern matching one whole annotation. It must define a named
            group ``ner`` (the entity text) and may define a named group
            ``tag`` (the entity label).

    Returns:
        A tuple ``(nelist, cleantext)``. ``nelist`` holds one
        ``(entity, tag, matched_text)`` tuple per match, in order of
        appearance. ``cleantext`` is *text* with every annotation replaced by
        its bare entity text.
    """
    rgx = re.compile(regex)
    # The 'tag' group is optional; decide once whether the pattern has it
    # instead of catching IndexError on every match.
    has_tag = 'tag' in rgx.groupindex
    nelist = []
    for mobj in rgx.finditer(text):
        tag = mobj.group('tag') if has_tag else None
        if tag is None:
            # Either the pattern has no 'tag' group or the group did not take
            # part in this match: fall back to the default label so that
            # downstream string concatenation never sees None.
            tag = 'PERSON'
        nelist.append((mobj.group('ner'), tag, mobj.group(0)))
    # Raw string: "\g" is not a valid Python string escape.
    cleantext = rgx.sub(r"\g<ner>", text)
    return (nelist, cleantext)
def GenerateTokensNERRules( nelist ):
    """Build the text of a TokensRegexNER mapping file from *nelist*.

    Each entry of *nelist* is a sequence whose first item is the entity text
    and whose second item is its tag. One tab-separated line is produced per
    entry: entity, tag, and the fixed list of tags the rule may overwrite.
    Returns the empty string for an empty list.
    """
    overwritable = 'PERSON,LOCATION,ORGANIZATION,MISC'
    lines = [
        '\t'.join((entry[0], entry[1], overwritable)) + '\n'
        for entry in nelist
    ]
    return ''.join(lines)
def _WriteTempFile( text ):
    """Write *text* as UTF-8 to a new temporary file and return its path.

    The file is closed before returning so that another process can open it;
    the caller is responsible for deleting it.
    """
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as tmp:
        tmp.write(text.encode('utf-8'))
    return tmp.name


def _CollectEntities( jsondata ):
    """Return the PERSON/LOCATION/ORGANIZATION mentions from CoreNLP JSON data.

    Each result is a dict with a running ``id`` (starting at 0), the mention
    text as ``label`` and its NER ``tag``.
    """
    wanted = ('PERSON', 'LOCATION', 'ORGANIZATION')
    entities = []
    for sent in jsondata['sentences']:
        for mention in sent['entitymentions']:
            tag = mention['ner']
            if tag in wanted:
                entities.append({
                    'id': len(entities),
                    'label': mention['text'],
                    'tag': tag
                })
    return entities


def GetEntities( origtext, stanford_jar='../stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar' ):
    """Run Stanford CoreNLP on *origtext* and return its named entities.

    Inline annotations of the form ``[TAG: name]`` are removed from the text
    and turned into an additional TokensRegexNER rules file, so that CoreNLP
    labels those names with the given tags.

    Args:
        origtext: Text with optional inline ``[TAG: name]`` annotations.
        stanford_jar: Path to the CoreNLP jar; every jar found under its
            directory is put on the Java classpath.

    Returns:
        A list of dicts ``{'id', 'label', 'tag'}`` for every PERSON, LOCATION
        and ORGANIZATION mention, in document order.

    Side effects:
        CoreNLP writes ``<input name>.json`` into the current working
        directory. That file and both temporary input files are deleted
        before returning, even if the Java run fails.
    """
    nelist, cleantext = ExtractLabeledEntitiesByRegex(
        origtext,
        r'(\[(?P<tag>[a-zA-Z]+)\:\s*)(?P<ner>(\s*\w)+)(\s*\])'
    )

    origname = _WriteTempFile(cleantext)
    rulename = None
    # CoreNLP names its output after the input file and, as used here, puts
    # it in the current working directory.
    jsonfilename = os.path.basename(origname) + '.json'
    try:
        rulename = _WriteTempFile(GenerateTokensNERRules(nelist))

        config_java(options='-mx4g', verbose=True)
        stanford_dir = os.path.split(stanford_jar)[0]
        _classpath = tuple(find_jars_within_path(stanford_dir))

        cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLP',
               '-annotators', 'tokenize,ssplit,pos,lemma,ner,parse,coref,coref.mention,depparse,natlog,openie,relation',
               '-ner.combinationMode', 'HIGH_RECALL',
               '-ner.additional.regexner.mapping', rulename,
               '-coref.algorithm', 'neural',
               '-outputFormat', 'json',
               '-file', origname
               ]

        # The text is passed via -file because feeding it through stdin
        # could not be made to work.
        stdout, stderr = java(cmd, classpath=_classpath, stdout=PIPE, stderr=PIPE)
        # NOTE(review): PrintJavaOutput is not defined in the visible part of
        # this file - confirm it is provided elsewhere.
        PrintJavaOutput(stdout, stderr)

        with open(jsonfilename, encoding='utf-8') as jsonfile:
            jsondata = json.load(jsonfile)
    finally:
        # All files are already closed here, so unlinking is safe on every
        # platform and nothing is left behind when the Java run raises.
        for path in (origname, rulename, jsonfilename):
            if path is not None and os.path.exists(path):
                os.unlink(path)

    return _CollectEntities(jsondata)
#### RUN ####
# Demo: annotate a short passage with inline [TAG: name] labels, run it
# through GetEntities and pretty-print every entity that comes back.
corpustext = (
    "During his youth, [PERSON:Alexander III of Macedon] was tutored by "
    "[PERSON: Aristotle] until age 16. "
    "Following the conquest of [LOCATION: Anatolia], [PERSON: Alexander] "
    "broke the power of [LOCATION: Persia] in a series of decisive battles, "
    "most notably the battles of [LOCATION: Issus] and "
    "[LOCATION: Gaugamela]. "
    "He subsequently overthrew [PERSON: Persian King Darius III] and "
    "conquered the [ORGANIZATION: Achaemenid Empire] in its entirety."
)

entities = GetEntities(corpustext)

for thisent in entities:
    pprint.pprint(thisent)
输出
{'id': 0, 'label': 'Alexander III of Macedon', 'tag': 'PERSON'}
{'id': 1, 'label': 'Aristotle', 'tag': 'PERSON'}
{'id': 2, 'label': 'his', 'tag': 'PERSON'}
{'id': 3, 'label': 'Anatolia', 'tag': 'LOCATION'}
{'id': 4, 'label': 'Alexander', 'tag': 'PERSON'}
{'id': 5, 'label': 'Persia', 'tag': 'LOCATION'}
{'id': 6, 'label': 'Issus', 'tag': 'LOCATION'}
{'id': 7, 'label': 'Gaugamela', 'tag': 'LOCATION'}
{'id': 8, 'label': 'Persian King Darius III', 'tag': 'PERSON'}
{'id': 9, 'label': 'Achaemenid Empire', 'tag': 'ORGANIZATION'}
{'id': 10, 'label': 'He', 'tag': 'PERSON'}