finallyliuyu
#!/usr/bin/python
# -*- coding: gbk -*-
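'''
spec: split usrdict into two parts according to whether an entry hits one of the 1.26 million (126W) person names
parms:
[IN]  keywords.txt  person-name keys, one per line
[IN]  file.txt      usrdict entries to classify
[OUT] matched.txt, notmatched.txt, ufuwfoverflow
author: liuyusi0121@sogou-inc.com date 20120808
'''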
import re;
import sys;
def LoadKeys(filename):
    '''
    Load the keys from a text file into memory.

    Each line of the file is one key.  Leading and trailing whitespace
    (including the line terminator) is removed from every line.

    Args:
        filename: path of the key file to read.

    Returns:
        A list of the non-empty keys, in file order.  Blank lines are
        skipped: an empty key would match every entry when the caller
        uses it as a prefix/suffix pattern.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    '''
    # Same whitespace pattern as before, written as a raw string so the
    # backslashes do not rely on unknown-escape pass-through.
    edge_ws = re.compile(r'^\s+|\s+$')
    keys = []
    # "with" guarantees the handle is closed even if reading fails.
    with open(filename, "r") as fid:
        for line in fid:
            key = edge_ws.sub('', line)
            if key:
                keys.append(key)
    return keys
def PrintUsage():
    '''
    Print the command-line synopsis to stdout and terminate the program.

    Raises:
        SystemExit: always, with exit status 1.
    '''
    print('program [IN] keywords.txt [IN]file.txt [OUT] matched.txt [OUT] notmatched.txt [OUT] ufuwfoverflow')
    # sys.exit instead of the bare exit(): the latter is injected by the
    # site module for interactive use and is absent under "python -S".
    sys.exit(1)

def _ToUnicode(raw):
    '''Return raw as text, decoding it from GBK when it is a byte string.'''
    # On Python 2 "bytes" is an alias of "str", so file contents are decoded;
    # text that is already decoded is passed through untouched.
    if isinstance(raw, bytes):
        return raw.decode('gbk')
    return raw


def SplitUserDict(keys, fin, fout_matched, fout_unmatched, fout_overflow,
                  delim="\t", progress_every=100000):
    '''
    Split dictionary entries by whether their first field hits a key.

    Every non-blank input line is stripped and split on delim.  Lines that
    do not have exactly 4 fields are dropped.  Of the remaining lines:

    * if field 3 or field 4 is an integer <= 0, the stripped line is
      written unchanged to fout_overflow;
    * else if field 1 starts or ends with one of the keys, the line is
      echoed to stdout and written to fout_matched with the first such key
      (in key order) appended as a 5th field;
    * otherwise (including when field 1 is not valid GBK) the line is
      written to fout_unmatched with "_" appended as a 5th field.

    Keys are compared as literal strings.  Keys that are empty or cannot
    be decoded as GBK are ignored.

    Args:
        keys: iterable of keys (GBK byte strings or text).
        fin: iterable of input lines.
        fout_matched, fout_unmatched, fout_overflow: objects with write().
        delim: field separator used for both splitting and joining.
        progress_every: print a progress line each time this many lines
            (or keys, while scanning) have been processed.

    Raises:
        ValueError: if field 3 or 4 of a 4-field line is not an integer.
    '''
    edge_ws = re.compile(r"(^\s+|\s+$)")

    # Decode each key once up front instead of once per input line.
    decoded_keys = []
    for key in keys:
        try:
            ukey = _ToUnicode(key)
        except UnicodeDecodeError:
            continue
        if ukey:  # an empty key would match every entry
            decoded_keys.append((key, ukey))

    linecount = 0
    for line in fin:
        line = edge_ws.sub('', line)
        if not line:
            continue
        if linecount % progress_every == 0:
            print('语料已经处理%d行' % linecount)
        # Count every non-blank line; incrementing only inside the progress
        # branch would freeze the counter at 1.
        linecount += 1

        linesegs = line.split(delim)
        if len(linesegs) != 4:
            continue
        if int(linesegs[2]) <= 0 or int(linesegs[3]) <= 0:
            fout_overflow.write(line)
            fout_overflow.write("\n")
            continue

        matched_key = None
        try:
            useg = _ToUnicode(linesegs[0])
        except UnicodeDecodeError:
            useg = None  # undecodable entry: classified as not matched
        if useg is not None:
            for count, (key, ukey) in enumerate(decoded_keys):
                if count % progress_every == 0:
                    print('模式已经扫描%d个' % count)
                # Literal prefix/suffix test; no regex is built, so special
                # characters in a key cannot alter or break the match.
                if useg.startswith(ukey) or useg.endswith(ukey):
                    matched_key = key
                    break

        if matched_key is None:
            linesegs.append("_")
            target = fout_unmatched
        else:
            print(line)
            linesegs.append(matched_key)
            target = fout_matched
        target.write(delim.join(linesegs))
        target.write("\n")


if __name__ == "__main__":
    if len(sys.argv) != 6:
        PrintUsage()
    keys = LoadKeys(sys.argv[1])
    print(len(keys))
    # The input is opened first so that a missing input file does not
    # truncate the output files; "with" closes all handles on any exit.
    with open(sys.argv[2], "r") as fid, \
            open(sys.argv[3], "w") as fout1, \
            open(sys.argv[4], "w") as fout2, \
            open(sys.argv[5], "w") as fout3:
        SplitUserDict(keys, fid, fout1, fout2, fout3)

分类:

技术点:

相关文章:

  • 2021-11-17
  • 2021-11-17
  • 2021-11-17
  • 2022-12-23
  • 2022-12-23
猜你喜欢
  • 2021-05-27
  • 2022-12-23
  • 2022-12-23
  • 2021-06-02
  • 2022-02-12
  • 2021-11-17
  • 2021-11-17
相关资源
相似解决方案