前提:Nutch 1.2 已部署到 Eclipse 中。
详见:http://www.cnblogs.com/cy163/archive/2013/02/19/2916419.html
1 部署IKAnalyzer3.2.8
1.1 下载IKAnalyzer3.2.8
1.2 将IKAnalyzer3.2.8的jar包复制到nutch/lib中,并在eclipse中将该jar添加到项目的构建路径(Build Path)
2 修改analysis源码
2.1 在src/java/org/apache/nutch/analysis包下找到NutchAnalysis.jj
(1)在 PARSER_BEGIN(NutchAnalysis)部分的导入声明中增加:
import org.wltea.analyzer.lucene.IKTokenizer;
(2)在 TOKEN_MGR_DECLS中增加:
// State shared across successive invocations of the SIGRAM lexical action.
// NOTE(review): TokenStream, TermAttribute and OffsetAttribute must be importable
// from the grammar's PARSER_BEGIN section — confirm the needed imports are present.

// Declared by the original guide; not used by the active code of the lexical action.
IKTokenizer Analyzer;
// Term attribute of the tokenizer: the current Chinese term produced by segmentation.
TermAttribute termAtt = null;
// Offset attribute of the tokenizer: start/end offsets of the current Chinese term.
OffsetAttribute offAtt = null;
// Tokenizer over the CJK run currently being split; null when no run is in progress.
TokenStream stream = null;
// Start position of the CJK run currently being split.
private int cjkStartOffset = 0;
(3)在 TOKEN 部分,找到| <SIGRAM: <CJK> >,这代表按单个汉字划分;将其修改为| <SIGRAM: (<CJK>)+ >,使连续的一段中文字符作为一个整体被匹配,以便交给IK分词器进行分词
再在该规则后面加上如下词法动作代码:
1 { 2 if (stream == null) { 3 stream = new IKTokenizer(new StringReader(image.toString()),true); 4 //stream = Analyzer.tokenStream("",new StringReader(image.toString())); 5 cjkStartOffset = matchedToken.beginColumn; 6 try { 7 stream.reset(); 8 } catch (IOException e) { 9 e.printStackTrace(); 10 } 11 termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); 12 offAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class); 13 try { 14 if (stream.incrementToken() == false) 15 termAtt = null; 16 } catch (IOException e) { 17 // TODO Auto-generated catch block 18 e.printStackTrace(); 19 } 20 } 21 if (termAtt != null && !termAtt.term().equals("")) { 22 matchedToken.image = termAtt.term(); 23 matchedToken.beginColumn = cjkStartOffset + offAtt.startOffset(); 24 matchedToken.endColumn = cjkStartOffset + offAtt.endOffset(); 25 try { 26 if (stream.incrementToken() != false) 27 input_stream.backup(1); 28 else 29 termAtt = null; 30 } catch (IOException e) { 31 e.printStackTrace(); 32 } 33 } 34 if (termAtt == null || termAtt.term().equals("")) { 35 stream = null; 36 cjkStartOffset = 0; 37 } 38 }