基于词典的逆向最大匹配中文分词算法,能够实现中文、英文、数字的混合分词。例如可以切分出这样的词:bb霜、3室、乐phone、touch4、mp3、T恤。实际分词效果比正向最大匹配分词的效果更好。

基于词典的逆向最大匹配中文分词算法,更好地实现中英文数字混合分词

public class RMM
{
// (原文中 RMM 类的实现在此处缺失)
}


/**
 * Static helpers used around the dictionary-based segmenter: splitting raw
 * input into sentences, merging runs of single-character tokens, and folding
 * full-width ASCII variants to their half-width form.
 */
public class Util
{
    // Distance between a full-width ASCII variant and its half-width form (65248).
    private static final int FULL_WIDTH_SHIFT = 0xFEE0;
    // Full-width digits '0'..'9' (65296..65305).
    private static final int FULL_WIDTH_DIGIT_FIRST = 0xFF10;
    private static final int FULL_WIDTH_DIGIT_LAST = 0xFF19;
    // Full-width upper-case letters 'A'..'Z' (65313..65338).
    private static final int FULL_WIDTH_UPPER_FIRST = 0xFF21;
    private static final int FULL_WIDTH_UPPER_LAST = 0xFF3A;
    // Full-width lower-case letters 'a'..'z' (65345..65370).
    private static final int FULL_WIDTH_LOWER_FIRST = 0xFF41;
    private static final int FULL_WIDTH_LOWER_LAST = 0xFF5A;

    /**
     * Splits the input into sentences made only of lower-case letters,
     * decimal digits and "other letter" characters (the category CJK
     * ideographs fall into). Every other character acts as a delimiter and is
     * dropped. Kept characters are passed through {@link #toAscii(int)} before
     * being stored.
     *
     * <p>A sentence is emitted at every delimiter and at end of input, so
     * consecutive delimiters produce sentences with empty text. An input with
     * no characters at all produces an empty list.
     *
     * <p>The offset given to each {@code Sentence} is the position, counted in
     * chars read from {@code reader}, of the sentence's first character. An
     * earlier version reported an offset one too large for the last sentence
     * when the input ended with a delimiter; that is corrected here.
     *
     * <p>NOTE(review): upper-case letters are treated as delimiters because
     * only LOWERCASE_LETTER, DECIMAL_DIGIT_NUMBER and OTHER_LETTER are
     * accepted. Presumably callers lower-case the input first; confirm.
     *
     * @param reader source of characters; it is read to the end but not closed
     * @return the sentences in input order
     * @throws IOException if reading from {@code reader} fails
     */
    public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
    {
        ArrayList<Sentence> list = new ArrayList<Sentence>();
        StringBuilder buffer = new StringBuilder();
        int consumed = 0; // number of chars taken from the reader so far
        int d = reader.read();
        while (d > -1)
        {
            boolean delimiter = !isSentenceChar(d);
            if (!delimiter)
            {
                buffer.append((char) toAscii(d));
            }
            consumed++;
            d = reader.read();
            if (delimiter || d == -1)
            {
                // The buffered text ends just before the delimiter, or at the
                // last char read when the flush is caused by end of input.
                int end = delimiter ? consumed - 1 : consumed;
                char[] text = new char[buffer.length()];
                buffer.getChars(0, buffer.length(), text, 0);
                list.add(new Sentence(text, end - buffer.length()));
                buffer.setLength(0);
            }
        }
        return list;
    }

    // True for the character categories that may appear inside a sentence.
    private static boolean isSentenceChar(int c)
    {
        int type = Character.getType(c);
        return type == Character.LOWERCASE_LETTER
            || type == Character.DECIMAL_DIGIT_NUMBER
            || type == Character.OTHER_LETTER;
    }

    /**
     * Merges runs of adjacent single-character tokens into one token.
     *
     * <p>A token takes part in a run when its word is exactly one character
     * long and that character is not of category OTHER_LETTER. Two such tokens
     * are joined when the end of the accumulated token equals the start of the
     * next one. All other tokens are copied through unchanged, in order.
     *
     * <p>The first token of each run is modified in place (its end and word
     * are extended), so objects in {@code list} may be changed by this call.
     *
     * @param list tokens in input order
     * @return a new list with the runs merged
     * @throws IOException declared for callers; not thrown by this code
     */
    public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
    {
        ArrayList<Token> tokenlist = new ArrayList<Token>();
        Token word = null; // run currently being accumulated, if any
        for (Token t : list)
        {
            if (isSingleNonOtherLetter(t))
            {
                if (word != null && word.getEnd() == t.getStart())
                {
                    // Directly adjacent: extend the current run.
                    word.setEnd(t.getEnd());
                    word.setWord(word.getWord() + t.getWord());
                }
                else
                {
                    // No run yet, or a gap: finish the old run and start a new one.
                    if (word != null)
                    {
                        tokenlist.add(word);
                    }
                    word = t;
                }
            }
            else
            {
                // Any other token closes the pending run and is kept as is.
                if (word != null)
                {
                    tokenlist.add(word);
                    word = null;
                }
                tokenlist.add(t);
            }
        }
        if (word != null)
        {
            tokenlist.add(word);
        }
        return tokenlist;
    }

    // True when the token's word is one character that is not an OTHER_LETTER.
    private static boolean isSingleNonOtherLetter(Token t)
    {
        String w = t.getWord();
        return w.length() == 1
            && Character.getType((int) w.charAt(0)) != Character.OTHER_LETTER;
    }

    /**
     * Converts a full-width digit or Latin letter to its half-width ASCII
     * counterpart. Any other code point is returned unchanged.
     *
     * @param codePoint the character to convert
     * @return the half-width form, or {@code codePoint} itself
     */
    public static int toAscii(int codePoint)
    {
        boolean fullWidthDigit =
            codePoint >= FULL_WIDTH_DIGIT_FIRST && codePoint <= FULL_WIDTH_DIGIT_LAST;
        boolean fullWidthUpper =
            codePoint >= FULL_WIDTH_UPPER_FIRST && codePoint <= FULL_WIDTH_UPPER_LAST;
        boolean fullWidthLower =
            codePoint >= FULL_WIDTH_LOWER_FIRST && codePoint <= FULL_WIDTH_LOWER_LAST;
        if (fullWidthDigit || fullWidthUpper || fullWidthLower)
        {
            return codePoint - FULL_WIDTH_SHIFT;
        }
        return codePoint;
    }
}


相关文章: