finallyliuyu
#!/usr/bin/python
# -*- coding: cp936 -*-

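# Approach: convert the byte strings to unicode so that regular expressions can
# match Chinese characters. This requires knowing the file's encoding, which is
# GBK in this example.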
import collections
import cPickle as mypickle
import io
import re
import sys
# Matches one Chinese character in the range U+4E00..U+9FA5. The pattern has to
# be a unicode literal so the range is read as code points rather than bytes.
_HANZI = re.compile(u'[\u4e00-\u9fa5]')


def _count_name_chars(lines):
    """Tally the characters of every two-character name in *lines*.

    A line contributes only when it contains exactly two characters matched
    by ``_HANZI``; the first is counted as the family name and the second as
    the given name. Everything else on the line (whitespace, ASCII, ...) is
    ignored, and lines with any other number of matches are skipped.

    Args:
        lines: iterable of unicode strings, one name per item.

    Returns:
        A ``(family, given)`` pair of ``collections.Counter`` objects keyed
        by single unicode characters.
    """
    family = collections.Counter()
    given = collections.Counter()
    for line in lines:
        chars = _HANZI.findall(line)
        if len(chars) == 2:
            family[chars[0]] += 1
            given[chars[1]] += 1
    return family, given


def _write_counts(path, counts):
    """Write *counts* to *path* as GBK ``char<TAB>count`` lines, highest first."""
    with io.open(path, 'w', encoding='gbk') as out:
        out.writelines(u'%s\t%d\n' % item for item in counts.most_common())


def main(source='above50purenames.txt',
         family_out='familyname.txt',
         first_out='firstname.txt'):
    """Build family-name and given-name frequency tables from a name list.

    Reads *source* as GBK text, counts the characters of all two-character
    names, writes the two tables sorted by descending count, and prints
    ``finish`` when done.

    Args:
        source: path of the GBK-encoded input file, one name per line.
        family_out: output path for the family-name (first character) table.
        first_out: output path for the given-name (second character) table.

    Raises:
        IOError / OSError: if a file cannot be opened.
        UnicodeDecodeError: if *source* contains bytes that are not valid GBK.
    """
    # Decode once while reading and encode once while writing, so all the
    # counting in between works on unicode text only.
    with io.open(source, 'r', encoding='gbk') as src:
        family, given = _count_name_chars(src)
    _write_counts(family_out, family)
    _write_counts(first_out, given)
    print('finish')


if __name__ == '__main__':
    main()
   

分类:

技术点:

相关文章:

  • 2021-09-17
  • 2021-11-17
  • 2021-11-17
  • 2021-11-17
  • 2021-12-28
  • 2021-11-17
猜你喜欢
  • 2021-09-07
  • 2021-09-17
  • 2021-09-17
  • 2022-12-23
  • 2021-11-27
相关资源
相似解决方案