【问题标题】:How to read document for Named Entity Recognition in OpenNLP(如何在 OpenNLP 中读取文档以进行命名实体识别)
【发布时间】:2014-01-16 20:28:44
【问题描述】:

我是 Java 新手,我的需求是读取文档并对其进行命名实体识别。对于简单的字符串,我做了以下尝试:

// Load the pre-trained person-name model. try-with-resources guarantees the
// stream is closed even when the model constructor throws (the original only
// closed it on the success path).
TokenNameFinderModel model;
try (InputStream is = new FileInputStream("data/en-ner-person.bin")) {
    model = new TokenNameFinderModel(is);
}

NameFinderME nameFinder = new NameFinderME(model);

// The name finder works on one already-tokenized sentence at a time.
String[] sentence = new String[]{"Smith",
                "Smithosian",
                "is",
                "a",
                "person"
                };

// Each returned Span is expressed in token indices into `sentence`.
Span nameSpans[] = nameFinder.find(sentence);

但是,我需要从文档中实际读取流,然后生成 XML。 谁能告诉我该怎么做

谢谢

【问题讨论】:

    标签: java opennlp named-entity-recognition


    【解决方案1】:

    没有人回答这个问题,所以我希望现在还为时不晚。

    对于实体提取,您需要具有字符串格式的文档文本。检查 stackoverflow 以了解将文档文本转换为 String 的多种方法(这里的简短回答是使用 BufferedInputStream 处理文本文件,或使用 Apache Tika 处理 MS 和 PDF 文件)

    一旦您将文档文本保存在内存中,此代码应该可以帮助您进行句子边界检测、标记化和 NER。然后获取此结果并使用 docname/docid、可能是一些文件元数据、实际实体字符串、类型和 Span(NE 在文本中命中的位置)以任何您想要的方式生成 xmlDoc

    这个类应该能帮助你入门

    package processors;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.util.ArrayList;
    import java.util.List;
    import opennlp.tools.namefind.NameFinderME;
    import opennlp.tools.namefind.TokenNameFinderModel;
    import opennlp.tools.sentdetect.SentenceDetector;
    import opennlp.tools.sentdetect.SentenceDetectorME;
    import opennlp.tools.sentdetect.SentenceModel;
    import opennlp.tools.tokenize.TokenizerME;
    import opennlp.tools.tokenize.TokenizerModel;
    import opennlp.tools.util.Span;
    
    /**
     * Minimal OpenNLP pipeline: sentence detection, tokenization and named
     * entity recognition over a single document held in memory as a String.
     *
     * <p>Each instance carries one document plus the three OpenNLP components it
     * needs. The loaded models are shared between instances; the detector, name
     * finder and tokenizer objects are created per instance in {@link #main}.
     */
    public class OpenNLPNER implements Runnable
    {

        /** Tokenizer model, loaded once by {@link #main} and reused for every document. */
        static TokenizerModel tm = null;
        /** Location name-finder model, loaded once by {@link #main} and reused for every document. */
        static TokenNameFinderModel locModel = null;
        /** Full text of the document this instance processes. */
        String doc;
        NameFinderME myNameFinder;
        TokenizerME wordBreaker;
        SentenceDetector sd;

        /** Creates an empty instance; the fields must be set before {@link #run()} is useful. */
        public OpenNLPNER()
        {
        }

        /**
         * Creates a processor for one document.
         *
         * @param document    full text of the document
         * @param sd          sentence detector used to split the document
         * @param mf          name finder run on each tokenized sentence
         * @param wordBreaker tokenizer applied to each sentence
         */
        public OpenNLPNER(String document, SentenceDetector sd, NameFinderME mf, TokenizerME wordBreaker)
        {
            System.out.println("got doc");
            this.sd = sd;
            this.myNameFinder = mf;
            this.wordBreaker = wordBreaker;
            doc = document;
        }

        /**
         * Placeholder document source; replace with real loading code.
         *
         * @return the texts to process (currently always an empty list)
         */
        private static List<String> getMyDocsFromSomewhere()
        {
            //this should return an object that has all the info about the doc you want
            return new ArrayList<String>();
        }

        /**
         * Loads the models, fetches the documents and runs NER on each of them.
         * Any failure is printed to standard output rather than propagated.
         *
         * @param args unused
         */
        public static void main(String[] args)
        {
            try
            {
                String modelPath = "c:\\temp\\opennlpmodels\\";

                if (tm == null)
                {
                    // Every model stream is closed as soon as the model has been read,
                    // including when the model constructor throws.
                    try (InputStream stream = new FileInputStream(new File(modelPath + "en-token.zip")))
                    {
                        tm = new TokenizerModel(stream);
                    }
                    try (InputStream stream = new FileInputStream(new File(modelPath + "en-ner-location.bin")))
                    {
                        locModel = new TokenNameFinderModel(stream);
                    }
                }

                System.out.println("getting data");
                List<String> docs = getMyDocsFromSomewhere();
                System.out.println("\tdone getting data");

                if (!docs.isEmpty())
                {
                    // Read the sentence model once instead of re-reading the file for
                    // every document; it is still only touched when there is work to do.
                    SentenceModel sentenceModel;
                    try (InputStream stream = new FileInputStream(new File(modelPath + "en-sent.zip")))
                    {
                        sentenceModel = new SentenceModel(stream);
                    }

                    for (String docu : docs)
                    {
                        //you could also use the runnable here and launch in a diff thread
                        new OpenNLPNER(docu,
                                new SentenceDetectorME(sentenceModel),
                                new NameFinderME(locModel), new TokenizerME(tm)).run();
                    }
                }

                System.out.println("done");
            } catch (Exception ex)
            {
                System.out.println(ex);
            }
        }

        /** Processes this instance's document, printing any exception instead of throwing it. */
        @Override
        public void run()
        {
            try
            {
                process(doc);
            } catch (Exception ex)
            {
                System.out.println(ex);
            }
        }

        /**
         * Splits the document into sentences, tokenizes each sentence, runs the name
         * finder on the tokens and prints every entity found together with the index
         * of its sentence and its span.
         *
         * @param document full text to analyse
         * @throws Exception if any OpenNLP component fails
         */
        public void process(String document) throws Exception
        {
            // Adaptive data is per-document context; reset it before a new document.
            myNameFinder.clearAdaptiveData();

            // One detection pass: the sentence texts are cut out of the document using
            // the spans, so the text and span arrays always have the same length.
            Span[] sentenceSpans = sd.sentPosDetect(document);
            Span[][] allnamesInDoc = new Span[sentenceSpans.length][];
            String[][] allTokensInDoc = new String[sentenceSpans.length][];

            for (int sentenceIndex = 0; sentenceIndex < sentenceSpans.length; sentenceIndex++)
            {
                String sentence = sentenceSpans[sentenceIndex].getCoveredText(document).toString();
                String[] stringTokens = wordBreaker.tokenize(sentence);
                allnamesInDoc[sentenceIndex] = myNameFinder.find(stringTokens);
                allTokensInDoc[sentenceIndex] = stringTokens;
            }

            //now access the data like this...
            for (int s = 0; s < sentenceSpans.length; s++)
            {
                Span[] namesInSentence = allnamesInDoc[s];
                String[] tokensInSentence = allTokensInDoc[s];
                String[] entities = Span.spansToStrings(namesInSentence, tokensInSentence);
                // entities[e] is the text of namesInSentence[e], so the span must be
                // looked up with the entity index, not the sentence index.
                for (int e = 0; e < entities.length; e++)
                {
                    //start building up the XML here....
                    System.out.println(entities[e] + " Was in setnence " + s + " @ " + namesInSentence[e].toString());
                }
            }
        }
    }
    

    【讨论】:

      猜你喜欢
      • 2015-03-16
      • 2011-10-20
      • 2013-02-21
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2014-03-17
      • 1970-01-01
      相关资源
      最近更新 更多