In [1]:

import json
path='usagov_bitly_data2012-03-16-1331923249.txt'
records=[json.loads(line) for line in open(path)]
records[0]

Out[1]:

{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'al': 'en-US,en;q=0.8',
 'c': 'US',
 'cy': 'Danvers',
 'g': 'A6qOVH',
 'gr': 'MA',
 'h': 'wfLQtf',
 'hc': 1331822918,
 'hh': '1.usa.gov',
 'l': 'orofrog',
 'll': [42.576698, -70.954903],
 'nk': 1,
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 't': 1331923247,
 'tz': 'America/New_York',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

In [5]:

time_zone=[rec['tz'] for rec in records if 'tz' in rec]
time_zone[:10]   #获取时区

Out[5]:

['America/New_York',
 'America/Denver',
 'America/New_York',
 'America/Sao_Paulo',
 'America/New_York',
 'America/New_York',
 'Europe/Warsaw',
 '',
 '',
 '']

In [17]:

from pandas import DataFrame,Series
import pandas as pd
import numpy as np
frame=DataFrame(records)
frame.head(15)

Out[17]:

	_heartbeat_	a	al	c	cy	g	gr	h	hc	hh	kw	l	ll	nk	r	t	tz	u
0	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	en-US,en;q=0.8	US	Danvers	A6qOVH	MA	wfLQtf	1.331823e+09	1.usa.gov	NaN	orofrog	[42.576698, -70.954903]	1.0	http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/...	1.331923e+09	America/New_York	http://www.ncbi.nlm.nih.gov/pubmed/22415991
1	NaN	GoogleMaps/RochesterNY	NaN	US	Provo	mwszkS	UT	mwszkS	1.308262e+09	j.mp	NaN	bitly	[40.218102, -111.613297]	0.0	http://www.AwareMap.com/	1.331923e+09	America/Denver	http://www.monroecounty.gov/etc/911/rss.php
2	NaN	Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...	en-US	US	Washington	xxr3Qb	DC	xxr3Qb	1.331920e+09	1.usa.gov	NaN	bitly	[38.9007, -77.043098]	1.0	http://t.co/03elZC4Q	1.331923e+09	America/New_York	http://boxer.senate.gov/en/press/releases/0316...
3	NaN	Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...	pt-br	BR	Braz	zCaLwp	27	zUtuOu	1.331923e+09	1.usa.gov	NaN	alelex88	[-23.549999, -46.616699]	0.0	direct	1.331923e+09	America/Sao_Paulo	http://apod.nasa.gov/apod/ap120312.html
4	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	en-US,en;q=0.8	US	Shrewsbury	9b6kNl	MA	9b6kNl	1.273672e+09	bit.ly	NaN	bitly	[42.286499, -71.714699]	0.0	http://www.shrewsbury-ma.gov/selco/	1.331923e+09	America/New_York	http://www.shrewsbury-ma.gov/egov/gallery/1341...
5	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	en-US,en;q=0.8	US	Shrewsbury	axNK8c	MA	axNK8c	1.273673e+09	bit.ly	NaN	bitly	[42.286499, -71.714699]	0.0	http://www.shrewsbury-ma.gov/selco/	1.331923e+09	America/New_York	http://www.shrewsbury-ma.gov/egov/gallery/1341...
6	NaN	Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1...	pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4	PL	Luban	wcndER	77	zkpJBR	1.331923e+09	1.usa.gov	NaN	bnjacobs	[51.116699, 15.2833]	0.0	http://plus.url.google.com/url?sa=z&n=13319232...	1.331923e+09	Europe/Warsaw	http://www.nasa.gov/mission_pages/nustar/main/...
7	NaN	Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/2...	bg,en-us;q=0.7,en;q=0.3	None	NaN	wcndER	NaN	zkpJBR	1.331923e+09	1.usa.gov	NaN	bnjacobs	NaN	0.0	http://www.facebook.com/	1.331923e+09		http://www.nasa.gov/mission_pages/nustar/main/...
8	NaN	Opera/9.80 (X11; Linux zbov; U; en) Presto/2.1...	en-US, en	None	NaN	wcndER	NaN	zkpJBR	1.331923e+09	1.usa.gov	NaN	bnjacobs	NaN	0.0	http://www.facebook.com/l.php?u=http%3A%2F%2F1...	1.331923e+09		http://www.nasa.gov/mission_pages/nustar/main/...
9	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4	None	NaN	zCaLwp	NaN	zUtuOu	1.331923e+09	1.usa.gov	NaN	alelex88	NaN	0.0	http://t.co/o1Pd0WeV	1.331923e+09		http://apod.nasa.gov/apod/ap120312.html
10	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)...	en-us,en;q=0.5	US	Seattle	vNJS4H	WA	u0uD9q	1.319564e+09	1.usa.gov	NaN	o_4us71ccioa	[47.5951, -122.332603]	1.0	direct	1.331923e+09	America/Los_Angeles	https://www.nysdot.gov/rexdesign/design/commun...
11	NaN	Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4...	en-us,en;q=0.5	US	Washington	wG7OIH	DC	A0nRz4	1.331816e+09	1.usa.gov	NaN	darrellissa	[38.937599, -77.092796]	0.0	http://t.co/ND7SoPyo	1.331923e+09	America/New_York	http://oversight.house.gov/wp-content/uploads/...
12	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)...	en-us,en;q=0.5	US	Alexandria	vNJS4H	VA	u0uD9q	1.319564e+09	1.usa.gov	NaN	o_4us71ccioa	[38.790901, -77.094704]	1.0	direct	1.331923e+09	America/New_York	https://www.nysdot.gov/rexdesign/design/commun...
13	1.331923e+09	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
14	NaN	Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...	en-us,en;q=0.5	US	Marietta	2rOUYc	GA	2rOUYc	1.255770e+09	1.usa.gov	NaN	bitly	[33.953201, -84.5177]	1.0	direct	1.331923e+09	America/New_York	http://toxtown.nlm.nih.gov/index.php

In [8]:

tz_counts=frame['tz'].value_counts()
tz_counts

Out[8]:

America/New_York                  1251
                                   521
America/Chicago                    400
America/Los_Angeles                382
America/Denver                     191
Europe/London                       74
Asia/Tokyo                          37
Pacific/Honolulu                    36
Europe/Madrid                       35
America/Sao_Paulo                   33
Europe/Berlin                       28
Europe/Rome                         27
America/Rainy_River                 25
Europe/Amsterdam                    22
America/Indianapolis                20
America/Phoenix                     20
Europe/Warsaw                       16
America/Mexico_City                 15
Europe/Stockholm                    14
Europe/Paris                        14
America/Vancouver                   12
Pacific/Auckland                    11
America/Puerto_Rico                 10
Asia/Hong_Kong                      10
Europe/Prague                       10
Europe/Oslo                         10
Europe/Helsinki                     10
Europe/Moscow                       10
Asia/Istanbul                        9
Asia/Calcutta                        9
                                  ... 
Europe/Belgrade                      2
Africa/Johannesburg                  1
America/Lima                         1
Africa/Lusaka                        1
Australia/Queensland                 1
Asia/Riyadh                          1
America/Tegucigalpa                  1
Asia/Novosibirsk                     1
America/La_Paz                       1
America/Montevideo                   1
Asia/Kuching                         1
America/Mazatlan                     1
America/Argentina/Mendoza            1
Asia/Nicosia                         1
Europe/Ljubljana                     1
America/Costa_Rica                   1
Asia/Yekaterinburg                   1
America/Santo_Domingo                1
Asia/Pontianak                       1
Europe/Uzhgorod                      1
America/Argentina/Cordoba            1
Europe/Sofia                         1
Africa/Casablanca                    1
Asia/Manila                          1
Europe/Volgograd                     1
Europe/Skopje                        1
America/Caracas                      1
America/Argentina/Buenos_Aires       1
America/Monterrey                    1
America/St_Kitts                     1
Name: tz, Length: 97, dtype: int64

In [23]:

clean_tz=frame['tz'].fillna('Missing')
clean_tz[clean_tz=='']='Unknown'        #处理缺失数据
print(clean_tz)
tz_counts=clean_tz.value_counts()
tz_counts[:10]

0          America/New_York
1            America/Denver
2          America/New_York
3         America/Sao_Paulo
4          America/New_York
5          America/New_York
6             Europe/Warsaw
7                   Unknown
8                   Unknown
9                   Unknown
10      America/Los_Angeles
11         America/New_York
12         America/New_York
13                  Missing
14         America/New_York
15           Asia/Hong_Kong
16           Asia/Hong_Kong
17         America/New_York
18           America/Denver
19              Europe/Rome
20             Africa/Ceuta
21         America/New_York
22         America/New_York
23         America/New_York
24            Europe/Madrid
25        Asia/Kuala_Lumpur
26             Asia/Nicosia
27        America/Sao_Paulo
28                  Unknown
29                  Unknown
               ...         
3530    America/Los_Angeles
3531                Unknown
3532       America/New_York
3533       America/New_York
3534        America/Chicago
3535        America/Chicago
3536                Unknown
3537    America/Tegucigalpa
3538    America/Los_Angeles
3539    America/Los_Angeles
3540         America/Denver
3541    America/Los_Angeles
3542    America/Los_Angeles
3543                Missing
3544        America/Chicago
3545        America/Chicago
3546    America/Los_Angeles
3547       America/New_York
3548        America/Chicago
3549       Europe/Stockholm
3550       America/New_York
3551                Unknown
3552        America/Chicago
3553       America/New_York
3554       America/New_York
3555       America/New_York
3556        America/Chicago
3557         America/Denver
3558    America/Los_Angeles
3559       America/New_York
Name: tz, Length: 3560, dtype: object

Out[23]:

America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Name: tz, dtype: int64

In [32]:

%pylab
tz_counts[:10].plot(kind='barh',rot=0)

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib

Out[32]:

<matplotlib.axes._subplots.AxesSubplot at 0x19f6f8d7278>

In [37]:

results=Series([x.split()[0] for x in frame.a.dropna()])
print(results[:5])
results.value_counts()[:6]    #数一下各有多少个

0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object

Out[37]:

Mozilla/5.0               2594
Mozilla/4.0                601
GoogleMaps/RochesterNY     121
Opera/9.80                  34
TEST_INTERNET_AGENT         24
GoogleProducer              21
dtype: int64

In [38]:

#统计Windows用户
cframe=frame[frame.a.notnull()]
operating_system=np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')
operating_system[:5]

Out[38]:

array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'],
      dtype='<U11')

In [47]:

by_tz_os=cframe.groupby(['tz',operating_system])
by_tz_os.size().unstack().fillna(0)   # 牛！

Out[47]:

	Not Windows	Windows
tz
	245.0	276.0
Africa/Cairo	0.0	3.0
Africa/Casablanca	0.0	1.0
Africa/Ceuta	0.0	2.0
Africa/Johannesburg	0.0	1.0
Africa/Lusaka	0.0	1.0
America/Anchorage	4.0	1.0
America/Argentina/Buenos_Aires	1.0	0.0
America/Argentina/Cordoba	0.0	1.0
America/Argentina/Mendoza	0.0	1.0
America/Bogota	1.0	2.0
America/Caracas	0.0	1.0
America/Chicago	115.0	285.0
America/Chihuahua	1.0	1.0
America/Costa_Rica	0.0	1.0
America/Denver	132.0	59.0
America/Edmonton	2.0	4.0
America/Guayaquil	2.0	0.0
America/Halifax	1.0	3.0
America/Indianapolis	8.0	12.0
America/La_Paz	0.0	1.0
America/Lima	0.0	1.0
America/Los_Angeles	130.0	252.0
America/Managua	0.0	3.0
America/Mazatlan	1.0	0.0
America/Mexico_City	7.0	8.0
America/Monterrey	1.0	0.0
America/Montevideo	0.0	1.0
America/Montreal	3.0	6.0
America/New_York	339.0	912.0
...	...	...
Europe/Berlin	9.0	19.0
Europe/Bratislava	1.0	2.0
Europe/Brussels	1.0	3.0
Europe/Bucharest	1.0	3.0
Europe/Budapest	0.0	5.0
Europe/Copenhagen	2.0	3.0
Europe/Dublin	1.0	2.0
Europe/Helsinki	2.0	8.0
Europe/Lisbon	1.0	7.0
Europe/Ljubljana	0.0	1.0
Europe/London	43.0	31.0
Europe/Madrid	16.0	19.0
Europe/Malta	0.0	2.0
Europe/Moscow	1.0	9.0
Europe/Oslo	2.0	8.0
Europe/Paris	4.0	10.0
Europe/Prague	3.0	7.0
Europe/Riga	1.0	1.0
Europe/Rome	8.0	19.0
Europe/Skopje	0.0	1.0
Europe/Sofia	0.0	1.0
Europe/Stockholm	2.0	12.0
Europe/Uzhgorod	0.0	1.0
Europe/Vienna	3.0	3.0
Europe/Vilnius	0.0	2.0
Europe/Volgograd	0.0	1.0
Europe/Warsaw	1.0	15.0
Europe/Zurich	4.0	0.0
Pacific/Auckland	3.0	8.0
Pacific/Honolulu	0.0	36.0

97 rows × 2 columns

In [7]:

#MovieLens 1M数据集
unames=['user_id','gender','age','occupation','zip']
users=pd.read_table('users.dat',sep='::',header=None,names=unames)
rnames=['user_id','movie_id','rating','timestamp']
ratings=pd.read_table('ratings.dat',sep='::',header=None,names=rnames)
mnames=['movie_id','title','genres']
movies=pd.read_table('movies.dat',sep='::',header=None,names=mnames)

D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  """
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  import sys

In [8]:

users.head()

Out[8]:

	user_id	gender	age	occupation	zip
0	1	F	1	10	48067
1	2	M	56	16	70072
2	3	M	25	15	55117
3	4	M	45	7	02460
4	5	M	25	20	55455

In [9]:

ratings[:5]

Out[9]:

	user_id	movie_id	rating	timestamp
0	1	1193	5	978300760
1	1	661	3	978302109
2	1	914	3	978301968
3	1	3408	4	978300275
4	1	2355	5	978824291

In [10]:

movies[:5]

Out[10]:

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy

In [12]:

data=pd.merge(pd.merge(ratings,users),movies)
data

Out[12]:

	user_id	movie_id	rating	timestamp	gender	age	occupation	zip	title	genres
0	1	1193	5	978300760	F	1	10	48067	One Flew Over the Cuckoo's Nest (1975)	Drama
1	2	1193	5	978298413	M	56	16	70072	One Flew Over the Cuckoo's Nest (1975)	Drama
2	12	1193	4	978220179	M	25	12	32793	One Flew Over the Cuckoo's Nest (1975)	Drama
3	15	1193	4	978199279	M	25	7	22903	One Flew Over the Cuckoo's Nest (1975)	Drama
4	17	1193	5	978158471	M	50	1	95350	One Flew Over the Cuckoo's Nest (1975)	Drama
5	18	1193	4	978156168	F	18	3	95825	One Flew Over the Cuckoo's Nest (1975)	Drama
6	19	1193	5	982730936	M	1	10	48073	One Flew Over the Cuckoo's Nest (1975)	Drama
7	24	1193	5	978136709	F	25	7	10023	One Flew Over the Cuckoo's Nest (1975)	Drama
8	28	1193	3	978125194	F	25	1	14607	One Flew Over the Cuckoo's Nest (1975)	Drama
9	33	1193	5	978557765	M	45	3	55421	One Flew Over the Cuckoo's Nest (1975)	Drama
10	39	1193	5	978043535	M	18	4	61820	One Flew Over the Cuckoo's Nest (1975)	Drama
11	42	1193	3	978038981	M	25	8	24502	One Flew Over the Cuckoo's Nest (1975)	Drama
12	44	1193	4	978018995	M	45	17	98052	One Flew Over the Cuckoo's Nest (1975)	Drama
13	47	1193	4	977978345	M	18	4	94305	One Flew Over the Cuckoo's Nest (1975)	Drama
14	48	1193	4	977975061	M	25	4	92107	One Flew Over the Cuckoo's Nest (1975)	Drama
15	49	1193	4	978813972	M	18	12	77084	One Flew Over the Cuckoo's Nest (1975)	Drama
16	53	1193	5	977946400	M	25	0	96931	One Flew Over the Cuckoo's Nest (1975)	Drama
17	54	1193	5	977944039	M	50	1	56723	One Flew Over the Cuckoo's Nest (1975)	Drama
18	58	1193	5	977933866	M	25	2	30303	One Flew Over the Cuckoo's Nest (1975)	Drama
19	59	1193	4	977934292	F	50	1	55413	One Flew Over the Cuckoo's Nest (1975)	Drama
20	62	1193	4	977968584	F	35	3	98105	One Flew Over the Cuckoo's Nest (1975)	Drama
21	80	1193	4	977786172	M	56	1	49327	One Flew Over the Cuckoo's Nest (1975)	Drama
22	81	1193	5	977785864	F	25	0	60640	One Flew Over the Cuckoo's Nest (1975)	Drama
23	88	1193	5	977694161	F	45	1	02476	One Flew Over the Cuckoo's Nest (1975)	Drama
24	89	1193	5	977683596	F	56	9	85749	One Flew Over the Cuckoo's Nest (1975)	Drama
25	95	1193	5	977626632	M	45	0	98201	One Flew Over the Cuckoo's Nest (1975)	Drama
26	96	1193	3	977621789	F	25	16	78028	One Flew Over the Cuckoo's Nest (1975)	Drama
27	99	1193	2	982791053	F	1	10	19390	One Flew Over the Cuckoo's Nest (1975)	Drama
28	102	1193	5	1040737607	M	35	19	20871	One Flew Over the Cuckoo's Nest (1975)	Drama
29	104	1193	2	977546620	M	25	12	00926	One Flew Over the Cuckoo's Nest (1975)	Drama
...	...	...	...	...	...	...	...	...	...	...
1000179	4933	3084	3	962757020	M	25	15	94040	Home Page (1999)	Documentary
1000180	4802	2218	2	1014866656	M	56	1	40601	Juno and Paycock (1930)	Drama
1000181	4812	2308	2	962932391	M	18	14	25301	Detroit 9000 (1973)	Action\|Crime
1000182	4874	624	4	962781918	F	25	4	70808	Condition Red (1995)	Action\|Drama\|Thriller
1000183	5059	1434	4	962484364	M	45	16	22652	Stranger, The (1994)	Action
1000184	5947	1434	4	957190428	F	45	16	97215	Stranger, The (1994)	Action
1000185	5077	1868	3	962417299	M	25	2	20037	Truce, The (1996)	Drama\|War
1000186	5944	1868	1	957197520	F	18	10	27606	Truce, The (1996)	Drama\|War
1000187	5105	404	3	962337582	M	50	7	18977	Brother Minister: The Assassination of Malcolm...	Documentary
1000188	5185	404	4	963402617	F	35	4	44485	Brother Minister: The Assassination of Malcolm...	Documentary
1000189	5532	404	5	959619841	M	25	17	27408	Brother Minister: The Assassination of Malcolm...	Documentary
1000190	5543	404	3	960127592	M	25	17	97401	Brother Minister: The Assassination of Malcolm...	Documentary
1000191	5220	2543	3	961546137	M	25	7	91436	Six Ways to Sunday (1997)	Comedy
1000192	5754	2543	4	958272316	F	18	1	60640	Six Ways to Sunday (1997)	Comedy
1000193	5227	591	3	961475931	M	18	10	64050	Tough and Deadly (1995)	Action\|Drama\|Thriller
1000194	5795	591	1	958145253	M	25	1	92688	Tough and Deadly (1995)	Action\|Drama\|Thriller
1000195	5313	3656	5	960920392	M	56	0	55406	Lured (1947)	Crime
1000196	5328	2438	4	960838075	F	25	4	91740	Outside Ozona (1998)	Drama\|Thriller
1000197	5334	3323	3	960796159	F	56	13	46140	Chain of Fools (2000)	Comedy\|Crime
1000198	5334	127	1	960795494	F	56	13	46140	Silence of the Palace, The (Saimt el Qusur) (1...	Drama
1000199	5334	3382	5	960796159	F	56	13	46140	Song of Freedom (1936)	Drama
1000200	5420	1843	3	960156505	F	1	19	14850	Slappy and the Stinkers (1998)	Children's\|Comedy
1000201	5433	286	3	960240881	F	35	17	45014	Nemesis 2: Nebula (1995)	Action\|Sci-Fi\|Thriller
1000202	5494	3530	4	959816296	F	35	17	94306	Smoking/No Smoking (1993)	Comedy
1000203	5556	2198	3	959445515	M	45	6	92103	Modulations (1998)	Documentary
1000204	5949	2198	5	958846401	M	18	17	47901	Modulations (1998)	Documentary
1000205	5675	2703	3	976029116	M	35	14	30030	Broken Vessels (1998)	Drama
1000206	5780	2845	1	958153068	M	18	17	92886	White Boys (1999)	Drama
1000207	5851	3607	5	957756608	F	18	20	55410	One Little Indian (1973)	Comedy\|Drama\|Western
1000208	5938	2909	4	957273353	M	25	1	35401	Five Wives, Three Secretaries and Me (1998)	Documentary

1000209 rows × 10 columns

In [13]:

mean_ratings=data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
mean_ratings.head()

Out[13]:

gender	F	M
title
$1,000,000 Duck (1971)	3.375000	2.761905
'Night Mother (1986)	3.388889	3.352941
'Til There Was You (1997)	2.675676	2.733333
'burbs, The (1989)	2.793478	2.962085
...And Justice for All (1979)	3.828571	3.689024

In [23]:

ratings_by_title=data.groupby('title').size()  #根据电影名字分组
ratings_by_title[:10]

Out[23]:

title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

In [31]:

active_titles=ratings_by_title.index[ratings_by_title>=250]
active_titles       #评分数据大于250条的电影名称

Out[31]:

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)

In [39]:

mean_ratings=mean_ratings.loc[active_titles]
mean_ratings

Out[39]:

gender	F	M
title
'burbs, The (1989)	2.793478	2.962085
10 Things I Hate About You (1999)	3.646552	3.311966
101 Dalmatians (1961)	3.791444	3.500000
101 Dalmatians (1996)	3.240000	2.911215
12 Angry Men (1957)	4.184397	4.328421
13th Warrior, The (1999)	3.112000	3.168000
2 Days in the Valley (1996)	3.488889	3.244813
20,000 Leagues Under the Sea (1954)	3.670103	3.709205
2001: A Space Odyssey (1968)	3.825581	4.129738
2010 (1984)	3.446809	3.413712
28 Days (2000)	3.209424	2.977707
39 Steps, The (1935)	3.965517	4.107692
54 (1998)	2.701754	2.782178
7th Voyage of Sinbad, The (1958)	3.409091	3.658879
8MM (1999)	2.906250	2.850962
About Last Night... (1986)	3.188679	3.140909
Absent Minded Professor, The (1961)	3.469388	3.446809
Absolute Power (1997)	3.469136	3.327759
Abyss, The (1989)	3.659236	3.689507
Ace Ventura: Pet Detective (1994)	3.000000	3.197917
Ace Ventura: When Nature Calls (1995)	2.269663	2.543333
Addams Family Values (1993)	3.000000	2.878531
Addams Family, The (1991)	3.186170	3.163498
Adventures in Babysitting (1987)	3.455782	3.208122
Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984)	3.308511	3.402321
Adventures of Priscilla, Queen of the Desert, The (1994)	3.989071	3.688811
Adventures of Robin Hood, The (1938)	4.166667	3.918367
African Queen, The (1951)	4.324232	4.223822
Age of Innocence, The (1993)	3.827068	3.339506
Agnes of God (1985)	3.534884	3.244898
...	...	...
White Men Can't Jump (1992)	3.028777	3.231061
Who Framed Roger Rabbit? (1988)	3.569378	3.713251
Who's Afraid of Virginia Woolf? (1966)	4.029703	4.096939
Whole Nine Yards, The (2000)	3.296552	3.404814
Wild Bunch, The (1969)	3.636364	4.128099
Wild Things (1998)	3.392000	3.459082
Wild Wild West (1999)	2.275449	2.131973
William Shakespeare's Romeo and Juliet (1996)	3.532609	3.318644
Willow (1988)	3.658683	3.453543
Willy Wonka and the Chocolate Factory (1971)	4.063953	3.789474
Witness (1985)	4.115854	3.941504
Wizard of Oz, The (1939)	4.355030	4.203138
Wolf (1994)	3.074074	2.899083
Women on the Verge of a Nervous Breakdown (1988)	3.934307	3.865741
Wonder Boys (2000)	4.043796	3.913649
Working Girl (1988)	3.606742	3.312500
World Is Not Enough, The (1999)	3.337500	3.388889
Wrong Trousers, The (1993)	4.588235	4.478261
Wyatt Earp (1994)	3.147059	3.283898
X-Files: Fight the Future, The (1998)	3.489474	3.493797
X-Men (2000)	3.682310	3.851702
Year of Living Dangerously (1982)	3.951220	3.869403
Yellow Submarine (1968)	3.714286	3.689286
You've Got Mail (1998)	3.542424	3.275591
Young Frankenstein (1974)	4.289963	4.239177
Young Guns (1988)	3.371795	3.425620
Young Guns II (1990)	2.934783	2.904025
Young Sherlock Holmes (1985)	3.514706	3.363344
Zero Effect (1998)	3.864407	3.723140
eXistenZ (1999)	3.098592	3.289086

1216 rows × 2 columns

In [45]:

top_female_ratings=mean_ratings.sort_values(by='F',ascending=False)
top_female_ratings[:10]      #女性最喜欢的十大电影排行

Out[45]:

gender	F	M
title
Close Shave, A (1995)	4.644444	4.473795
Wrong Trousers, The (1993)	4.588235	4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)	4.572650	4.464589
Wallace & Gromit: The Best of Aardman Animation (1996)	4.563107	4.385075
Schindler's List (1993)	4.562602	4.491415
Shawshank Redemption, The (1994)	4.539075	4.560625
Grand Day Out, A (1992)	4.537879	4.293255
To Kill a Mockingbird (1962)	4.536667	4.372611
Creature Comforts (1990)	4.513889	4.272277
Usual Suspects, The (1995)	4.513317	4.518248

In [51]:

#找出男女分歧最大的电影
mean_ratings['diff']=mean_ratings['M']-mean_ratings['F']
sorted_by_diff=mean_ratings.sort_values(by='diff')     #sort_values排序
print(sorted_by_diff[:5])
print(sorted_by_diff[::-1][:5])
sorted_by_diff[-5:]           #男女差别较大且 男性更喜欢的电影

gender                            F         M      diff
title                                                  
Dirty Dancing (1987)       3.790378  2.959596 -0.830782
Jumpin' Jack Flash (1986)  3.254717  2.578358 -0.676359
Grease (1978)              3.975265  3.367041 -0.608224
Little Women (1994)        3.870588  3.321739 -0.548849
Steel Magnolias (1989)     3.901734  3.365957 -0.535777
gender                                         F         M      diff
title                                                               
Good, The Bad and The Ugly, The (1966)  3.494949  4.221300  0.726351
Kentucky Fried Movie, The (1977)        2.878788  3.555147  0.676359
Dumb & Dumber (1994)                    2.697987  3.336595  0.638608
Longest Day, The (1962)                 3.411765  4.031447  0.619682
Cable Guy, The (1996)                   2.250000  2.863787  0.613787

Out[51]:

gender	F	M	diff
title
Cable Guy, The (1996)	2.250000	2.863787	0.613787
Longest Day, The (1962)	3.411765	4.031447	0.619682
Dumb & Dumber (1994)	2.697987	3.336595	0.638608
Kentucky Fried Movie, The (1977)	2.878788	3.555147	0.676359
Good, The Bad and The Ugly, The (1966)	3.494949	4.221300	0.726351

In [57]:

#不考虑性别因素 只找出分歧最大的电影
rating_std_by_title=data.groupby('title')['rating'].std()   
print(rating_std_by_title[:5])       #每个电影打分的方差
rating_std_by_title=rating_std_by_title.loc[active_titles]
print(type(rating_std_by_title))
rating_std_by_title.sort_values(ascending=False)[:5]

title
$1,000,000 Duck (1971)           1.092563
'Night Mother (1986)             1.118636
'Til There Was You (1997)        1.020159
'burbs, The (1989)               1.107760
...And Justice for All (1979)    0.878110
Name: rating, dtype: float64
<class 'pandas.core.series.Series'>

Out[57]:

title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Name: rating, dtype: float64

In [1]:

#全美婴儿姓名分析
names1880=pd.read_csv('yob1880.txt',names=['name','sex','births'])
names1880.head()

Out[1]:

	name	sex	births
0	Mary	F	7065
1	Anna	F	2604
2	Emma	F	2003
3	Elizabeth	F	1939
4	Minnie	F	1746

In [62]:

names1880.groupby('sex')['births'].sum()

Out[62]:

sex
F     90993
M    110493
Name: births, dtype: int64

In [2]:

years=range(1880,2011)
pieces=[]
columns=['names','sex','births']
for year in years:
    path='yob%d.txt' %year
    frame=pd.read_csv(path,names=columns)
    frame['year']=year
    pieces.append(frame)
#print(pieces)
names=pd.concat(pieces,ignore_index=True)

In [78]:

names.head()

Out[78]:

	names	sex	births	year
0	Mary	F	7065	1880
1	Anna	F	2604	1880
2	Emma	F	2003	1880
3	Elizabeth	F	1939	1880
4	Minnie	F	1746	1880

In [3]:

total_births=names.pivot_table('births',index='year',columns='sex',aggfunc=sum)
total_births.head()

Out[3]:

sex	F	M
year
1880	90993	110493
1881	91955	100748
1882	107851	113687
1883	112322	104632
1884	129021	114445

In [87]:

import matplotlib.pyplot as plt
total_births.plot(title="Total births by sex and year")
plt.show()

In [29]:

def add_prop(group):   #名字占比
    births=group.births.astype(float)
    group['prop']=births/births.sum()
    return group
names=names.groupby(['year','sex']).apply(add_prop)
names.head()

Out[29]:

	names	sex	births	year	prop
0	Mary	F	7065	1880	0.077643
1	Anna	F	2604	1880	0.028618
2	Emma	F	2003	1880	0.022013
3	Elizabeth	F	1939	1880	0.021309
4	Minnie	F	1746	1880	0.019188

In [31]:

def get_top1000(group):    #获取每年各男女前1000的
    return group.sort_values(by='births',ascending=False)[:1000]
grouped=names.groupby(['year','sex'])
top1000=grouped.apply(get_top1000)
top1000[:10]

Out[31]:

			names	sex	births	year	prop
year	sex
1880	F	0	Mary	F	7065	1880	0.077643
		1	Anna	F	2604	1880	0.028618
		2	Emma	F	2003	1880	0.022013
		3	Elizabeth	F	1939	1880	0.021309
		4	Minnie	F	1746	1880	0.019188
		5	Margaret	F	1578	1880	0.017342
		6	Ida	F	1472	1880	0.016177
		7	Alice	F	1414	1880	0.015540
		8	Bertha	F	1320	1880	0.014507
		9	Sarah	F	1288	1880	0.014155

In [32]:

boys=top1000[top1000['sex']=='M']
girls=top1000[top1000['sex']=='F']
total_births=top1000.pivot_table('births',index='year',columns='names',aggfunc=sum)
total_births.head()   #每年叫这些名字的人数

D:\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)

Out[32]:

names	Aaden	Aaliyah	Aarav	Aaron	Aarush	Ab	Abagail	Abb	Abbey	Abbie	...	Zoa	Zoe	Zoey	Zoie	Zola	Zollie	Zona	Zora	Zula	Zuri
year
1880	NaN	NaN	NaN	102.0	NaN	NaN	NaN	NaN	NaN	71.0	...	8.0	23.0	NaN	NaN	7.0	NaN	8.0	28.0	27.0	NaN
1881	NaN	NaN	NaN	94.0	NaN	NaN	NaN	NaN	NaN	81.0	...	NaN	22.0	NaN	NaN	10.0	NaN	9.0	21.0	27.0	NaN
1882	NaN	NaN	NaN	85.0	NaN	NaN	NaN	NaN	NaN	80.0	...	8.0	25.0	NaN	NaN	9.0	NaN	17.0	32.0	21.0	NaN
1883	NaN	NaN	NaN	105.0	NaN	NaN	NaN	NaN	NaN	79.0	...	NaN	23.0	NaN	NaN	10.0	NaN	11.0	35.0	25.0	NaN
1884	NaN	NaN	NaN	97.0	NaN	NaN	NaN	NaN	NaN	98.0	...	13.0	31.0	NaN	NaN	14.0	6.0	8.0	58.0	27.0	NaN

5 rows × 6868 columns

In [33]:

import matplotlib.pyplot as plt
subset=total_births[['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True,figsize=(12,10),grid=True,title="Number of births per year")
plt.show()

In [39]:

table=top1000.pivot_table('prop',index='year',columns='sex',aggfunc=sum) #前1000名字占比
table.plot(title='Sum of table1000.prop by year and sex',yticks=np.linspace(0,1.2,13),xticks=range(1880,2020,10))
plt.show()   #每年前1000个名字占比逐渐减少 说明命名更加多样化

D:\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)
D:\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'sex' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)

In [44]:

df=boys[boys['year']==2010]
df[:5]
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1000 entries, (2010, M, 1676644) to (2010, M, 1677645)
Data columns (total 5 columns):
names     1000 non-null object
sex       1000 non-null object
births    1000 non-null int64
year      1000 non-null int64
prop      1000 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.0+ MB

In [52]:

#最后一个字母的变革
get_last_letter=lambda x:x[-1]
last_letters=names.names.map(get_last_letter)   #map匿名函数
last_letters.name='last_letter'
last_letters.head()

Out[52]:

0    y
1    a
2    a
3    h
4    e
Name: last_letter, dtype: object

In [55]:

table=names.pivot_table('births',index=last_letters,columns=['sex','year'],aggfunc=sum)
subtable=table.reindex(columns=[1910,1960,2010],level='year')
subtable.head()

Out[55]:

sex	F			M
year	1910	1960	2010	1910	1960	2010
last_letter
a	108376.0	691247.0	670605.0	977.0	5204.0	28438.0
b	NaN	694.0	450.0	411.0	3912.0	38859.0
c	5.0	49.0	946.0	482.0	15476.0	23125.0
d	6750.0	3729.0	2607.0	22111.0	262112.0	44398.0
e	133569.0	435013.0	313833.0	28655.0	178823.0	129012.0

In [56]:

subtable.sum()   #男女各年总数

Out[56]:

sex  year
F    1910     396416.0
     1960    2022062.0
     2010    1759010.0
M    1910     194198.0
     1960    2132588.0
     2010    1898382.0
dtype: float64

In [59]:

letter_prop=subtable/subtable.sum().astype(float)
fig,axes=plt.subplots(2,1,figsize=(10,8))
letter_prop['M'].plot(kind='bar',rot=0,ax=axes[0],title='Male')
letter_prop['F'].plot(kind='bar',rot=0,ax=axes[1],title='Female',legend=False)
plt.show()