使用 Lucene 3.1 索引和搜索 MS excel答案

【问题标题】：Indexing and searching a MS excel using Lucene 3.1（使用 Lucene 3.1 索引和搜索 MS Excel）
【发布时间】：2011-11-29 01:58:00
【问题描述】：

我有一张 MS Excel 表格，包含以下列

title,cast,director,genre.

我使用 jxl 库解析 Excel 工作表。索引过程工作正常，但是当我搜索时，总是得到 0 条命中结果（0 hits）。我不知道哪里出错了。代码如下：

import java.io.File;
import java.io.IOException;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Indexes the columns of the first sheet of an Excel workbook with Lucene 3.1
 * and runs a simple query against the resulting on-disk index.
 *
 * <p>Each spreadsheet column becomes one Lucene document holding a single
 * field: the field name is the column's first cell (the header) and the field
 * value is every remaining cell of that column joined by single spaces.
 */
public class ExcelParser {

    /** Location of the on-disk index, shared by indexing and searching. */
    private static final String INDEX_PATH = "d:\\index";

    /**
     * Directory opened by {@link #parse(String)} and used by
     * {@link #indexDocs(Sheet)}. It is intentionally left open after indexing
     * so that code reading this field keeps working.
     */
    Directory index;

    /**
     * Analyzer used at both index time and query time.
     *
     * <p>NOTE(review): this is a StandardAnalyzer with its default stop-word
     * set, so a query consisting only of a stop word (for example "the") is
     * expected to match nothing - confirm whether that is intended.
     */
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

    /** Not used inside this class; kept so the set of package-visible fields is unchanged. */
    IndexWriterConfig c = new IndexWriterConfig(Version.LUCENE_31, analyzer);

    /**
     * Opens the index directory and indexes the first sheet of the workbook.
     *
     * @param filePath path of the Excel workbook to read
     * @throws IndexOutOfBoundsException if the workbook has no sheet at index 0
     * @throws BiffException if jxl cannot read the workbook
     * @throws IOException if the workbook or the index cannot be accessed
     */
    public void parse(String filePath) throws IndexOutOfBoundsException,
            BiffException, IOException {
        index = FSDirectory.open(new File(INDEX_PATH));
        Workbook workbook = Workbook.getWorkbook(new File(filePath));
        try {
            indexDocs(workbook.getSheet(0));
        } finally {
            // Release the workbook once its cells have been copied into the index.
            workbook.close();
        }
    }

    /**
     * Writes one document per non-empty column of the given sheet.
     *
     * @param contentSheet sheet whose columns are indexed
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the index cannot be written
     */
    void indexDocs(Sheet contentSheet) throws CorruptIndexException,
            IOException {
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31,
                analyzer);
        IndexWriter writer = new IndexWriter(index, iwc);
        try {
            for (int i = 0; i < contentSheet.getColumns(); i++) {
                Cell[] xlCells = contentSheet.getColumn(i);
                if (xlCells.length == 0) {
                    // No header cell, hence no field name: nothing to index.
                    continue;
                }
                String currentColumn = xlCells[0].getContents();
                StringBuilder sb = new StringBuilder();
                for (int j = 1; j < xlCells.length; j++) {
                    sb.append(xlCells[j].getContents()).append(' ');
                }
                addDoc(writer, sb.toString(), currentColumn);
            }
        } finally {
            // Always close the writer so the index write lock is released,
            // even when a column fails to index.
            writer.close();
        }
    }

    /**
     * Parses the query against the given field, searches the on-disk index and
     * prints the number of collected hits followed by the stored value of the
     * searched field for each hit.
     *
     * @param querystr query text in QueryParser syntax
     * @param onField  default field to search; also the stored field printed per hit
     * @throws ParseException if the query text cannot be parsed
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the index cannot be read
     */
    void searcher(String querystr, String onField) throws ParseException,
            CorruptIndexException, IOException {
        Directory dir = FSDirectory.open(new File(INDEX_PATH));
        try {
            IndexSearcher searcher = new IndexSearcher(dir);
            try {
                Query q = new QueryParser(Version.LUCENE_31, onField, analyzer)
                        .parse(querystr);

                int hitsPerPage = 2;
                TopScoreDocCollector collector = TopScoreDocCollector.create(
                        hitsPerPage, true);

                searcher.search(q, collector);
                ScoreDoc[] hits = collector.topDocs().scoreDocs;
                System.out.println("Found " + hits.length + " hits.");
                for (int i = 0; i < hits.length; ++i) {
                    Document d = searcher.doc(hits[i].doc);
                    // Every document stores exactly one field (its column
                    // header), so print the field that was searched rather
                    // than a hard-coded "title".
                    System.out.println((i + 1) + ". " + d.get(onField));
                }
            } finally {
                searcher.close();
            }
        } finally {
            // The searcher does not own the directory, so close it separately.
            dir.close();
        }
    }

    /**
     * Adds a document with a single stored, analyzed field to the writer.
     *
     * @param w         writer receiving the document
     * @param value     field text
     * @param fieldName field name
     * @throws IOException if the document cannot be written
     */
    private static void addDoc(IndexWriter w, String value, String fieldName)
            throws IOException {
        Document doc = new Document();
        doc.add(new Field(fieldName, value, Field.Store.YES,
                Field.Index.ANALYZED));
        w.addDocument(doc);
    }

    /**
     * Indexes {@code d:\movieList.xls} and then searches the "title" field
     * for the term "the".
     */
    public static void main(String[] args) throws IndexOutOfBoundsException,
            BiffException, IOException {
        ExcelParser p = new ExcelParser();

        p.parse("d:\\movieList.xls");

        try {
            p.searcher("the", "title");
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

}

【问题讨论】：

标签： java search lucene indexing

【解决方案1】：

您搜索的词 the 位于 StandardAnalyzer 默认的停用词（stop words）列表中，因此它在分析阶段被过滤掉了，搜索自然得不到任何命中结果。

将Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);更改为

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31, new HashSet()); 即传入一个空集合来清空停用词列表（需要 import java.util.HashSet）。修改后需要重新建立索引。

见：http://lucene.apache.org/java/3_0_1/api/core/org/apache/lucene/analysis/standard/StandardAnalyzer.html

【讨论】：