"""Count and plot the most frequent 'tz' (time zone) values in a JSON-lines file.

The input file named by ``path`` is read as one JSON document per line.  The
counting is done twice: once with plain Python helpers (``get_count`` /
``top_n``) and once with pandas (``value_counts``), and the ten most frequent
values from the pandas count are drawn as a horizontal bar chart.
"""
import json
from collections import Counter

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

path = '/Users/zhushuqiang/python/download/pydata-book/datasets/bitly_usagov/example.txt'


def load_records(file_path):
    """Parse a file holding one JSON document per line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle even if a line fails to parse.
    with open(file_path, encoding='utf-8') as source:
        # Blank lines (e.g. a trailing newline) carry no record; skip them
        # rather than letting json.loads raise on an empty string.
        return [json.loads(line) for line in source if line.strip()]


def get_count(sequence):
    """Count how many times each element occurs in ``sequence``.

    Args:
        sequence: Iterable of hashable elements.

    Returns:
        A plain ``dict`` mapping each distinct element to its number of
        occurrences, keyed in first-seen order.  Empty input gives ``{}``.
    """
    return dict(Counter(sequence))


def top_n(count_dict, n=10):
    """Return the ``n`` entries of ``count_dict`` with the largest counts.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of entries to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        most frequent key is last.  Entries with equal counts are ordered by
        key.  Returns ``[]`` when ``n`` is zero or negative.
    """
    # Guard explicitly: a slice of ``[-0:]`` would return the whole list.
    if n <= 0:
        return []
    ranked = sorted((count, key) for key, count in count_dict.items())
    return ranked[-n:]


if __name__ == '__main__':
    # Imported here rather than at the top so the helper functions above can
    # be imported without a plotting backend being available.
    import matplotlib.pyplot as plt

    records = load_records(path)
    print(records[0].keys())

    # Two equivalent ways to test whether a key is present in a record.
    print('g' in records[0].keys())
    print('g' in records[0])

    # Collect the time zone of every record that has one.
    time_zones = [record['tz'] for record in records if 'tz' in record]

    # Count occurrences and show the most frequent time zones.
    count_dict = get_count(time_zones)
    print(top_n(count_dict))

    # The same analysis with pandas.
    fd = DataFrame(records)
    print(fd['tz'][:10])
    print(fd['tz'].value_counts())

    # Records without a 'tz' field become NaN in the frame; label them.
    clean_tz = fd['tz'].fillna('Missing')
    # Boolean-mask assignment: relabel every empty-string entry in one step.
    clean_tz[clean_tz == ''] = 'Unknown'

    # Count occurrences of each label.
    tz_count = clean_tz.value_counts()

    # Horizontal bar chart of the ten most frequent labels.
    tz_count[:10].plot(kind='barh', rot=0)
    plt.show()
# 相关文章 (related articles):