如何在 OpenNLP 中读取文档以进行命名实体识别 - 答案

【问题标题】：How to read document for Named Entity Recognition in OpenNLP（如何在 OpenNLP 中读取文档以进行命名实体识别）
【发布时间】：2014-01-16 20:28:44
【问题描述】：

我是 Java 新手，我的需求是读取文档并对其执行命名实体识别。对于简单的字符串，我是这样做的：

// Load the name finder model from data/en-ner-person.bin. try-with-resources
// closes the stream even when building the model throws.
TokenNameFinderModel model;
try (InputStream is = new FileInputStream("data/en-ner-person.bin")) {
    model = new TokenNameFinderModel(is);
}
NameFinderME nameFinder = new NameFinderME(model);

// The name finder is given one sentence that has already been split into tokens.
String[] sentence = new String[]{
        "Smith",
        "Smithosian",
        "is",
        "a",
        "person"
};

// Spans of the names detected in the token array above.
Span[] nameSpans = nameFinder.find(sentence);

但是，我实际需要的是从文档中读取输入流，然后生成 XML。谁能告诉我该怎么做？

谢谢

【问题讨论】：

标签： java opennlp named-entity-recognition

【解决方案1】：

没有人回答这个问题，所以我希望现在还为时不晚。

对于实体提取，您需要具有字符串格式的文档文本。检查 stackoverflow 以了解将文档文本转换为 String 的多种方法（这里的简短回答是使用 BufferedInputStream 处理文本文件，或使用 Apache Tika 处理 MS 和 PDF 文件）

一旦您将文档文本保存在内存中，此代码应该可以帮助您进行句子边界检测、标记化和 NER。然后获取此结果并使用 docname/docid、可能是一些文件元数据、实际实体字符串、类型和 Span（NE 在文本中命中的位置）以任何您想要的方式生成 xmlDoc

下面这个类应该可以帮助你入门：

package processors;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

/**
 * Demo pipeline that runs sentence detection, tokenization and name finding
 * over a document and prints every entity found together with its span.
 *
 * <p>{@link #main(String[])} loads the models and processes each document
 * returned by {@code getMyDocsFromSomewhere()}, building a new instance with
 * its own detector, name finder and tokenizer per document. The class
 * implements {@link Runnable} so an instance can also be handed to another
 * thread.
 */
public class OpenNLPNER implements Runnable
{

    /** Tokenizer model, loaded once by {@code main} and reused for every document. */
    static TokenizerModel tm = null;
    /** Name finder model read from {@code en-ner-location.bin}, loaded once by {@code main}. */
    static TokenNameFinderModel locModel = null;
    /** Text of the document this instance processes when {@link #run()} is called. */
    String doc;
    NameFinderME myNameFinder;
    TokenizerME wordBreaker;
    SentenceDetector sd;

    /**
     * Creates an instance with no document and no NLP components set.
     * All fields stay {@code null} until they are assigned.
     */
    public OpenNLPNER()
    {
    }

    /**
     * Creates an instance that will process one document.
     *
     * @param document    full text of the document
     * @param sd          sentence detector used to split the document
     * @param mf          name finder applied to each tokenized sentence
     * @param wordBreaker tokenizer applied to each sentence
     */
    public OpenNLPNER(String document, SentenceDetector sd, NameFinderME mf, TokenizerME wordBreaker)
    {
        System.out.println("got doc");
        this.sd = sd;
        this.myNameFinder = mf;
        this.wordBreaker = wordBreaker;
        doc = document;
    }

    /**
     * Placeholder for the document source; replace it with real loading code.
     *
     * @return the document texts to process; currently always an empty list
     */
    private static List<String> getMyDocsFromSomewhere()
    {
        //this should return an object that has all the info about the doc you want
        return new ArrayList<String>();
    }

    /**
     * Loads the models from a hard-coded directory and processes every
     * document sequentially. Any failure is printed to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        try
        {
            String modelPath = "c:\\temp\\opennlpmodels\\";

            if (tm == null)
            {
                // try-with-resources closes both model files even if loading fails.
                try (InputStream tokenStream = new FileInputStream(new File(modelPath + "en-token.zip"));
                     InputStream locationStream = new FileInputStream(new File(modelPath + "en-ner-location.bin")))
                {
                    TokenizerModel tokenizerModel = new TokenizerModel(tokenStream);
                    TokenNameFinderModel locationModel = new TokenNameFinderModel(locationStream);
                    // Publish only after both loaded: tm is the "already loaded" guard,
                    // so it must never be set while locModel is still null.
                    locModel = locationModel;
                    tm = tokenizerModel;
                }
            }

            System.out.println("getting data");
            List<String> docs = getMyDocsFromSomewhere();
            System.out.println("\tdone getting data");

            // Read the sentence model once instead of re-reading the file per document.
            SentenceModel sentenceModel;
            try (InputStream sentenceStream = new FileInputStream(new File(modelPath + "en-sent.zip")))
            {
                sentenceModel = new SentenceModel(sentenceStream);
            }

            for (String docu : docs)
            {
                //you could also use the runnable here and launch in a diff thread
                // Each document gets its own detector, name finder and tokenizer instances.
                new OpenNLPNER(docu,
                        new SentenceDetectorME(sentenceModel),
                        new NameFinderME(locModel),
                        new TokenizerME(tm)).run();
            }

            System.out.println("done");
        } catch (Exception ex)
        {
            System.out.println(ex);
        }
    }

    /**
     * Processes the document passed to the constructor. Exceptions thrown by
     * {@link #process(String)} are printed to standard output, not propagated.
     */
    @Override
    public void run()
    {
        try
        {
            process(doc);
        } catch (Exception ex)
        {
            System.out.println(ex);
        }
    }

    /**
     * Splits the document into sentences, tokenizes each sentence, runs the
     * name finder on the tokens and prints one line per entity found.
     *
     * @param document full text of the document
     * @throws Exception if any of the NLP components fails
     */
    public void process(String document) throws Exception
    {
        // Drop the adaptive data the name finder collected from earlier find() calls.
        myNameFinder.clearAdaptiveData();

        // Detect sentences once; each sentence's text is cut from the document by its span.
        Span[] sentenceSpans = sd.sentPosDetect(document);
        Span[][] allnamesInDoc = new Span[sentenceSpans.length][];
        String[][] allTokensInDoc = new String[sentenceSpans.length][];

        for (int sentenceIndex = 0; sentenceIndex < sentenceSpans.length; sentenceIndex++)
        {
            String sentence = sentenceSpans[sentenceIndex].getCoveredText(document).toString();
            String[] stringTokens = wordBreaker.tokenize(sentence);
            allnamesInDoc[sentenceIndex] = myNameFinder.find(stringTokens);
            allTokensInDoc[sentenceIndex] = stringTokens;
        }

        //now access the data like this...
        for (int s = 0; s < sentenceSpans.length; s++)
        {
            Span[] namesInSentence = allnamesInDoc[s];
            String[] tokensInSentence = allTokensInDoc[s];
            String[] entities = Span.spansToStrings(namesInSentence, tokensInSentence);
            // entities[e] is the text of namesInSentence[e], so the span is looked up
            // by the entity index, not by the sentence index.
            for (int e = 0; e < entities.length; e++)
            {
                //start building up the XML here....
                System.out.println(entities[e] + " Was in setnence " + s + " @ " + namesInSentence[e].toString());
            }
        }
    }
}

【讨论】：