#-*- coding:cp936-*-
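# Approach: the text must be decoded from a byte str to unicode before the
# regular expression can match Chinese characters. This requires knowing the
# encoding of the input file, which is GBK in this case.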
import cPickle as mypickle
import io
import re
import sys
# Matches one CJK unified ideograph in the range U+4E00-U+9FA5. The pattern
# has to be a unicode literal so that the \u escapes are interpreted and the
# match is done on characters rather than on encoded bytes.
_HANZI = re.compile(u'[\u4e00-\u9fa5]')


def _count_name_parts(lines):
    """Tally the two characters of every line holding exactly two ideographs.

    Args:
        lines: iterable of unicode strings, one input line each.

    Returns:
        A ``(family_counts, given_counts)`` tuple of dicts mapping a single
        character to the number of lines in which it appeared as the first
        (family) or second (given) ideograph.
    """
    family_counts = {}
    given_counts = {}
    for line in lines:
        hanzi = _HANZI.findall(line)
        # Only the ideographs are inspected; whitespace and any other
        # characters on the line are ignored. Lines that do not contain
        # exactly two ideographs are skipped.
        if len(hanzi) != 2:
            continue
        family, given = hanzi
        family_counts[family] = family_counts.get(family, 0) + 1
        given_counts[given] = given_counts.get(given, 0) + 1
    return family_counts, given_counts


def _write_counts(path, counts, encoding):
    """Write ``counts`` to ``path`` as ``<character><TAB><count>`` lines.

    Rows are ordered by descending count. Ties are broken by the character
    itself so that the output does not depend on dict iteration order.
    """
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    # The context manager closes the file even if encoding a row fails.
    with io.open(path, 'w', encoding=encoding) as out:
        for name, count in ranked:
            out.write(u'%s\t%d\n' % (name, count))


def main(source_path='above50purenames.txt',
         family_path='familyname.txt',
         given_path='firstname.txt',
         encoding='gbk'):
    """Count family-name and given-name characters in a list of names.

    Reads ``source_path`` line by line, counts the first and second ideograph
    of every line that contains exactly two ideographs, and writes the two
    frequency tables to ``family_path`` and ``given_path``.

    Args:
        source_path: input text file with one name per line.
        family_path: output file for the first-character counts.
        given_path: output file for the second-character counts.
        encoding: text encoding used for the input and both outputs.

    Raises:
        IOError: if a file cannot be opened, read or written.
        UnicodeDecodeError: if the input is not valid text in ``encoding``.
    """
    # Decode once while reading and encode once while writing; everything in
    # between works on unicode text.
    with io.open(source_path, 'r', encoding=encoding) as source:
        family_counts, given_counts = _count_name_parts(source)

    _write_counts(family_path, family_counts, encoding)
    _write_counts(given_path, given_counts, encoding)
    print('finish')


if __name__ == '__main__':
    main()