In [1]:
import json
path='usagov_bitly_data2012-03-16-1331923249.txt'
records=[json.loads(line) for line in open(path)]
records[0]
Out[1]:
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'al': 'en-US,en;q=0.8',
 'c': 'US',
 'cy': 'Danvers',
 'g': 'A6qOVH',
 'gr': 'MA',
 'h': 'wfLQtf',
 'hc': 1331822918,
 'hh': '1.usa.gov',
 'l': 'orofrog',
 'll': [42.576698, -70.954903],
 'nk': 1,
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 't': 1331923247,
 'tz': 'America/New_York',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
In [5]:
time_zone=[rec['tz'] for rec in records if 'tz' in rec]
time_zone[:10]   #获取时区
Out[5]:
['America/New_York',
 'America/Denver',
 'America/New_York',
 'America/Sao_Paulo',
 'America/New_York',
 'America/New_York',
 'Europe/Warsaw',
 '',
 '',
 '']
In [17]:
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
frame=DataFrame(records)
frame.head(15)
Out[17]:
  _heartbeat_ a al c cy g gr h hc hh kw l ll nk r t tz u
0 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Danvers A6qOVH MA wfLQtf 1.331823e+09 1.usa.gov NaN orofrog [42.576698, -70.954903] 1.0 http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... 1.331923e+09 America/New_York http://www.ncbi.nlm.nih.gov/pubmed/22415991
1 NaN GoogleMaps/RochesterNY NaN US Provo mwszkS UT mwszkS 1.308262e+09 j.mp NaN bitly [40.218102, -111.613297] 0.0 http://www.AwareMap.com/ 1.331923e+09 America/Denver http://www.monroecounty.gov/etc/911/rss.php
2 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-US US Washington xxr3Qb DC xxr3Qb 1.331920e+09 1.usa.gov NaN bitly [38.9007, -77.043098] 1.0 http://t.co/03elZC4Q 1.331923e+09 America/New_York http://boxer.senate.gov/en/press/releases/0316...
3 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... pt-br BR Braz zCaLwp 27 zUtuOu 1.331923e+09 1.usa.gov NaN alelex88 [-23.549999, -46.616699] 0.0 direct 1.331923e+09 America/Sao_Paulo http://apod.nasa.gov/apod/ap120312.html
4 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Shrewsbury 9b6kNl MA 9b6kNl 1.273672e+09 bit.ly NaN bitly [42.286499, -71.714699] 0.0 http://www.shrewsbury-ma.gov/selco/ 1.331923e+09 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341...
5 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Shrewsbury axNK8c MA axNK8c 1.273673e+09 bit.ly NaN bitly [42.286499, -71.714699] 0.0 http://www.shrewsbury-ma.gov/selco/ 1.331923e+09 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341...
6 NaN Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1... pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4 PL Luban wcndER 77 zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs [51.116699, 15.2833] 0.0 http://plus.url.google.com/url?sa=z&n=13319232... 1.331923e+09 Europe/Warsaw http://www.nasa.gov/mission_pages/nustar/main/...
7 NaN Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/2... bg,en-us;q=0.7,en;q=0.3 None NaN wcndER NaN zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs NaN 0.0 http://www.facebook.com/ 1.331923e+09   http://www.nasa.gov/mission_pages/nustar/main/...
8 NaN Opera/9.80 (X11; Linux zbov; U; en) Presto/2.1... en-US, en None NaN wcndER NaN zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs NaN 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.331923e+09   http://www.nasa.gov/mission_pages/nustar/main/...
9 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4 None NaN zCaLwp NaN zUtuOu 1.331923e+09 1.usa.gov NaN alelex88 NaN 0.0 http://t.co/o1Pd0WeV 1.331923e+09   http://apod.nasa.gov/apod/ap120312.html
10 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... en-us,en;q=0.5 US Seattle vNJS4H WA u0uD9q 1.319564e+09 1.usa.gov NaN o_4us71ccioa [47.5951, -122.332603] 1.0 direct 1.331923e+09 America/Los_Angeles https://www.nysdot.gov/rexdesign/design/commun...
11 NaN Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4... en-us,en;q=0.5 US Washington wG7OIH DC A0nRz4 1.331816e+09 1.usa.gov NaN darrellissa [38.937599, -77.092796] 0.0 http://t.co/ND7SoPyo 1.331923e+09 America/New_York http://oversight.house.gov/wp-content/uploads/...
12 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... en-us,en;q=0.5 US Alexandria vNJS4H VA u0uD9q 1.319564e+09 1.usa.gov NaN o_4us71ccioa [38.790901, -77.094704] 1.0 direct 1.331923e+09 America/New_York https://www.nysdot.gov/rexdesign/design/commun...
13 1.331923e+09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
14 NaN Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US... en-us,en;q=0.5 US Marietta 2rOUYc GA 2rOUYc 1.255770e+09 1.usa.gov NaN bitly [33.953201, -84.5177] 1.0 direct 1.331923e+09 America/New_York http://toxtown.nlm.nih.gov/index.php
In [8]:
tz_counts=frame['tz'].value_counts()
tz_counts
Out[8]:
America/New_York                  1251
                                   521
America/Chicago                    400
America/Los_Angeles                382
America/Denver                     191
Europe/London                       74
Asia/Tokyo                          37
Pacific/Honolulu                    36
Europe/Madrid                       35
America/Sao_Paulo                   33
Europe/Berlin                       28
Europe/Rome                         27
America/Rainy_River                 25
Europe/Amsterdam                    22
America/Indianapolis                20
America/Phoenix                     20
Europe/Warsaw                       16
America/Mexico_City                 15
Europe/Stockholm                    14
Europe/Paris                        14
America/Vancouver                   12
Pacific/Auckland                    11
America/Puerto_Rico                 10
Asia/Hong_Kong                      10
Europe/Prague                       10
Europe/Oslo                         10
Europe/Helsinki                     10
Europe/Moscow                       10
Asia/Istanbul                        9
Asia/Calcutta                        9
                                  ... 
Europe/Belgrade                      2
Africa/Johannesburg                  1
America/Lima                         1
Africa/Lusaka                        1
Australia/Queensland                 1
Asia/Riyadh                          1
America/Tegucigalpa                  1
Asia/Novosibirsk                     1
America/La_Paz                       1
America/Montevideo                   1
Asia/Kuching                         1
America/Mazatlan                     1
America/Argentina/Mendoza            1
Asia/Nicosia                         1
Europe/Ljubljana                     1
America/Costa_Rica                   1
Asia/Yekaterinburg                   1
America/Santo_Domingo                1
Asia/Pontianak                       1
Europe/Uzhgorod                      1
America/Argentina/Cordoba            1
Europe/Sofia                         1
Africa/Casablanca                    1
Asia/Manila                          1
Europe/Volgograd                     1
Europe/Skopje                        1
America/Caracas                      1
America/Argentina/Buenos_Aires       1
America/Monterrey                    1
America/St_Kitts                     1
Name: tz, Length: 97, dtype: int64
In [23]:
clean_tz=frame['tz'].fillna('Missing')
clean_tz[clean_tz=='']='Unknown'        #处理缺失数据
print(clean_tz)
tz_counts=clean_tz.value_counts()
tz_counts[:10]
0          America/New_York
1            America/Denver
2          America/New_York
3         America/Sao_Paulo
4          America/New_York
5          America/New_York
6             Europe/Warsaw
7                   Unknown
8                   Unknown
9                   Unknown
10      America/Los_Angeles
11         America/New_York
12         America/New_York
13                  Missing
14         America/New_York
15           Asia/Hong_Kong
16           Asia/Hong_Kong
17         America/New_York
18           America/Denver
19              Europe/Rome
20             Africa/Ceuta
21         America/New_York
22         America/New_York
23         America/New_York
24            Europe/Madrid
25        Asia/Kuala_Lumpur
26             Asia/Nicosia
27        America/Sao_Paulo
28                  Unknown
29                  Unknown
               ...         
3530    America/Los_Angeles
3531                Unknown
3532       America/New_York
3533       America/New_York
3534        America/Chicago
3535        America/Chicago
3536                Unknown
3537    America/Tegucigalpa
3538    America/Los_Angeles
3539    America/Los_Angeles
3540         America/Denver
3541    America/Los_Angeles
3542    America/Los_Angeles
3543                Missing
3544        America/Chicago
3545        America/Chicago
3546    America/Los_Angeles
3547       America/New_York
3548        America/Chicago
3549       Europe/Stockholm
3550       America/New_York
3551                Unknown
3552        America/Chicago
3553       America/New_York
3554       America/New_York
3555       America/New_York
3556        America/Chicago
3557         America/Denver
3558    America/Los_Angeles
3559       America/New_York
Name: tz, Length: 3560, dtype: object
Out[23]:
America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Name: tz, dtype: int64
In [32]:
%pylab
tz_counts[:10].plot(kind='barh',rot=0)
Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x19f6f8d7278>
In [37]:
results=Series([x.split()[0] for x in frame.a.dropna()])
print(results[:5])
results.value_counts()[:6]    #数一下各有多少个
0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object
Out[37]:
Mozilla/5.0               2594
Mozilla/4.0                601
GoogleMaps/RochesterNY     121
Opera/9.80                  34
TEST_INTERNET_AGENT         24
GoogleProducer              21
dtype: int64
In [38]:
#统计Windows用户
cframe=frame[frame.a.notnull()]
operating_system=np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')
operating_system[:5]
Out[38]:
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'],
      dtype='<U11')
In [47]:
by_tz_os=cframe.groupby(['tz',operating_system])
by_tz_os.size().unstack().fillna(0)   # 牛!
Out[47]:
  Not Windows Windows
tz    
  245.0 276.0
Africa/Cairo 0.0 3.0
Africa/Casablanca 0.0 1.0
Africa/Ceuta 0.0 2.0
Africa/Johannesburg 0.0 1.0
Africa/Lusaka 0.0 1.0
America/Anchorage 4.0 1.0
America/Argentina/Buenos_Aires 1.0 0.0
America/Argentina/Cordoba 0.0 1.0
America/Argentina/Mendoza 0.0 1.0
America/Bogota 1.0 2.0
America/Caracas 0.0 1.0
America/Chicago 115.0 285.0
America/Chihuahua 1.0 1.0
America/Costa_Rica 0.0 1.0
America/Denver 132.0 59.0
America/Edmonton 2.0 4.0
America/Guayaquil 2.0 0.0
America/Halifax 1.0 3.0
America/Indianapolis 8.0 12.0
America/La_Paz 0.0 1.0
America/Lima 0.0 1.0
America/Los_Angeles 130.0 252.0
America/Managua 0.0 3.0
America/Mazatlan 1.0 0.0
America/Mexico_City 7.0 8.0
America/Monterrey 1.0 0.0
America/Montevideo 0.0 1.0
America/Montreal 3.0 6.0
America/New_York 339.0 912.0
... ... ...
Europe/Berlin 9.0 19.0
Europe/Bratislava 1.0 2.0
Europe/Brussels 1.0 3.0
Europe/Bucharest 1.0 3.0
Europe/Budapest 0.0 5.0
Europe/Copenhagen 2.0 3.0
Europe/Dublin 1.0 2.0
Europe/Helsinki 2.0 8.0
Europe/Lisbon 1.0 7.0
Europe/Ljubljana 0.0 1.0
Europe/London 43.0 31.0
Europe/Madrid 16.0 19.0
Europe/Malta 0.0 2.0
Europe/Moscow 1.0 9.0
Europe/Oslo 2.0 8.0
Europe/Paris 4.0 10.0
Europe/Prague 3.0 7.0
Europe/Riga 1.0 1.0
Europe/Rome 8.0 19.0
Europe/Skopje 0.0 1.0
Europe/Sofia 0.0 1.0
Europe/Stockholm 2.0 12.0
Europe/Uzhgorod 0.0 1.0
Europe/Vienna 3.0 3.0
Europe/Vilnius 0.0 2.0
Europe/Volgograd 0.0 1.0
Europe/Warsaw 1.0 15.0
Europe/Zurich 4.0 0.0
Pacific/Auckland 3.0 8.0
Pacific/Honolulu 0.0 36.0

97 rows × 2 columns

In [7]:
#MovieLens 1M数据集
unames=['user_id','gender','age','occupation','zip']
users=pd.read_table('users.dat',sep='::',header=None,names=unames)
rnames=['user_id','movie_id','rating','timestamp']
ratings=pd.read_table('ratings.dat',sep='::',header=None,names=rnames)
mnames=['movie_id','title','genres']
movies=pd.read_table('movies.dat',sep='::',header=None,names=mnames)
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  """
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  import sys
In [8]:
users.head()
Out[8]:
  user_id gender age occupation zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455
In [9]:
ratings[:5]
Out[9]:
  user_id movie_id rating timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
In [10]:
movies[:5]
Out[10]:
  movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
In [12]:
data=pd.merge(pd.merge(ratings,users),movies)
data
Out[12]:
  user_id movie_id rating timestamp gender age occupation zip title genres
0 1 1193 5 978300760 F 1 10 48067 One Flew Over the Cuckoo's Nest (1975) Drama
1 2 1193 5 978298413 M 56 16 70072 One Flew Over the Cuckoo's Nest (1975) Drama
2 12 1193 4 978220179 M 25 12 32793 One Flew Over the Cuckoo's Nest (1975) Drama
3 15 1193 4 978199279 M 25 7 22903 One Flew Over the Cuckoo's Nest (1975) Drama
4 17 1193 5 978158471 M 50 1 95350 One Flew Over the Cuckoo's Nest (1975) Drama
5 18 1193 4 978156168 F 18 3 95825 One Flew Over the Cuckoo's Nest (1975) Drama
6 19 1193 5 982730936 M 1 10 48073 One Flew Over the Cuckoo's Nest (1975) Drama
7 24 1193 5 978136709 F 25 7 10023 One Flew Over the Cuckoo's Nest (1975) Drama
8 28 1193 3 978125194 F 25 1 14607 One Flew Over the Cuckoo's Nest (1975) Drama
9 33 1193 5 978557765 M 45 3 55421 One Flew Over the Cuckoo's Nest (1975) Drama
10 39 1193 5 978043535 M 18 4 61820 One Flew Over the Cuckoo's Nest (1975) Drama
11 42 1193 3 978038981 M 25 8 24502 One Flew Over the Cuckoo's Nest (1975) Drama
12 44 1193 4 978018995 M 45 17 98052 One Flew Over the Cuckoo's Nest (1975) Drama
13 47 1193 4 977978345 M 18 4 94305 One Flew Over the Cuckoo's Nest (1975) Drama
14 48 1193 4 977975061 M 25 4 92107 One Flew Over the Cuckoo's Nest (1975) Drama
15 49 1193 4 978813972 M 18 12 77084 One Flew Over the Cuckoo's Nest (1975) Drama
16 53 1193 5 977946400 M 25 0 96931 One Flew Over the Cuckoo's Nest (1975) Drama
17 54 1193 5 977944039 M 50 1 56723 One Flew Over the Cuckoo's Nest (1975) Drama
18 58 1193 5 977933866 M 25 2 30303 One Flew Over the Cuckoo's Nest (1975) Drama
19 59 1193 4 977934292 F 50 1 55413 One Flew Over the Cuckoo's Nest (1975) Drama
20 62 1193 4 977968584 F 35 3 98105 One Flew Over the Cuckoo's Nest (1975) Drama
21 80 1193 4 977786172 M 56 1 49327 One Flew Over the Cuckoo's Nest (1975) Drama
22 81 1193 5 977785864 F 25 0 60640 One Flew Over the Cuckoo's Nest (1975) Drama
23 88 1193 5 977694161 F 45 1 02476 One Flew Over the Cuckoo's Nest (1975) Drama
24 89 1193 5 977683596 F 56 9 85749 One Flew Over the Cuckoo's Nest (1975) Drama
25 95 1193 5 977626632 M 45 0 98201 One Flew Over the Cuckoo's Nest (1975) Drama
26 96 1193 3 977621789 F 25 16 78028 One Flew Over the Cuckoo's Nest (1975) Drama
27 99 1193 2 982791053 F 1 10 19390 One Flew Over the Cuckoo's Nest (1975) Drama
28 102 1193 5 1040737607 M 35 19 20871 One Flew Over the Cuckoo's Nest (1975) Drama
29 104 1193 2 977546620 M 25 12 00926 One Flew Over the Cuckoo's Nest (1975) Drama
... ... ... ... ... ... ... ... ... ... ...
1000179 4933 3084 3 962757020 M 25 15 94040 Home Page (1999) Documentary
1000180 4802 2218 2 1014866656 M 56 1 40601 Juno and Paycock (1930) Drama
1000181 4812 2308 2 962932391 M 18 14 25301 Detroit 9000 (1973) Action|Crime
1000182 4874 624 4 962781918 F 25 4 70808 Condition Red (1995) Action|Drama|Thriller
1000183 5059 1434 4 962484364 M 45 16 22652 Stranger, The (1994) Action
1000184 5947 1434 4 957190428 F 45 16 97215 Stranger, The (1994) Action
1000185 5077 1868 3 962417299 M 25 2 20037 Truce, The (1996) Drama|War
1000186 5944 1868 1 957197520 F 18 10 27606 Truce, The (1996) Drama|War
1000187 5105 404 3 962337582 M 50 7 18977 Brother Minister: The Assassination of Malcolm... Documentary
1000188 5185 404 4 963402617 F 35 4 44485 Brother Minister: The Assassination of Malcolm... Documentary
1000189 5532 404 5 959619841 M 25 17 27408 Brother Minister: The Assassination of Malcolm... Documentary
1000190 5543 404 3 960127592 M 25 17 97401 Brother Minister: The Assassination of Malcolm... Documentary
1000191 5220 2543 3 961546137 M 25 7 91436 Six Ways to Sunday (1997) Comedy
1000192 5754 2543 4 958272316 F 18 1 60640 Six Ways to Sunday (1997) Comedy
1000193 5227 591 3 961475931 M 18 10 64050 Tough and Deadly (1995) Action|Drama|Thriller
1000194 5795 591 1 958145253 M 25 1 92688 Tough and Deadly (1995) Action|Drama|Thriller
1000195 5313 3656 5 960920392 M 56 0 55406 Lured (1947) Crime
1000196 5328 2438 4 960838075 F 25 4 91740 Outside Ozona (1998) Drama|Thriller
1000197 5334 3323 3 960796159 F 56 13 46140 Chain of Fools (2000) Comedy|Crime
1000198 5334 127 1 960795494 F 56 13 46140 Silence of the Palace, The (Saimt el Qusur) (1... Drama
1000199 5334 3382 5 960796159 F 56 13 46140 Song of Freedom (1936) Drama
1000200 5420 1843 3 960156505 F 1 19 14850 Slappy and the Stinkers (1998) Children's|Comedy
1000201 5433 286 3 960240881 F 35 17 45014 Nemesis 2: Nebula (1995) Action|Sci-Fi|Thriller
1000202 5494 3530 4 959816296 F 35 17 94306 Smoking/No Smoking (1993) Comedy
1000203 5556 2198 3 959445515 M 45 6 92103 Modulations (1998) Documentary
1000204 5949 2198 5 958846401 M 18 17 47901 Modulations (1998) Documentary
1000205 5675 2703 3 976029116 M 35 14 30030 Broken Vessels (1998) Drama
1000206 5780 2845 1 958153068 M 18 17 92886 White Boys (1999) Drama
1000207 5851 3607 5 957756608 F 18 20 55410 One Little Indian (1973) Comedy|Drama|Western
1000208 5938 2909 4 957273353 M 25 1 35401 Five Wives, Three Secretaries and Me (1998) Documentary

1000209 rows × 10 columns

In [13]:
mean_ratings=data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
mean_ratings.head()
Out[13]:
gender F M
title    
$1,000,000 Duck (1971) 3.375000 2.761905
'Night Mother (1986) 3.388889 3.352941
'Til There Was You (1997) 2.675676 2.733333
'burbs, The (1989) 2.793478 2.962085
...And Justice for All (1979) 3.828571 3.689024
In [23]:
ratings_by_title=data.groupby('title').size()  #根据电影名字分组
ratings_by_title[:10]
Out[23]:
title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64
In [31]:
active_titles=ratings_by_title.index[ratings_by_title>=250]
active_titles       #评分数据大于250条的电影名称
Out[31]:
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)
In [39]:
mean_ratings=mean_ratings.loc[active_titles]
mean_ratings
Out[39]:
gender F M
title    
'burbs, The (1989) 2.793478 2.962085
10 Things I Hate About You (1999) 3.646552 3.311966
101 Dalmatians (1961) 3.791444 3.500000
101 Dalmatians (1996) 3.240000 2.911215
12 Angry Men (1957) 4.184397 4.328421
13th Warrior, The (1999) 3.112000 3.168000
2 Days in the Valley (1996) 3.488889 3.244813
20,000 Leagues Under the Sea (1954) 3.670103 3.709205
2001: A Space Odyssey (1968) 3.825581 4.129738
2010 (1984) 3.446809 3.413712
28 Days (2000) 3.209424 2.977707
39 Steps, The (1935) 3.965517 4.107692
54 (1998) 2.701754 2.782178
7th Voyage of Sinbad, The (1958) 3.409091 3.658879
8MM (1999) 2.906250 2.850962
About Last Night... (1986) 3.188679 3.140909
Absent Minded Professor, The (1961) 3.469388 3.446809
Absolute Power (1997) 3.469136 3.327759
Abyss, The (1989) 3.659236 3.689507
Ace Ventura: Pet Detective (1994) 3.000000 3.197917
Ace Ventura: When Nature Calls (1995) 2.269663 2.543333
Addams Family Values (1993) 3.000000 2.878531
Addams Family, The (1991) 3.186170 3.163498
Adventures in Babysitting (1987) 3.455782 3.208122
Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984) 3.308511 3.402321
Adventures of Priscilla, Queen of the Desert, The (1994) 3.989071 3.688811
Adventures of Robin Hood, The (1938) 4.166667 3.918367
African Queen, The (1951) 4.324232 4.223822
Age of Innocence, The (1993) 3.827068 3.339506
Agnes of God (1985) 3.534884 3.244898
... ... ...
White Men Can't Jump (1992) 3.028777 3.231061
Who Framed Roger Rabbit? (1988) 3.569378 3.713251
Who's Afraid of Virginia Woolf? (1966) 4.029703 4.096939
Whole Nine Yards, The (2000) 3.296552 3.404814
Wild Bunch, The (1969) 3.636364 4.128099
Wild Things (1998) 3.392000 3.459082
Wild Wild West (1999) 2.275449 2.131973
William Shakespeare's Romeo and Juliet (1996) 3.532609 3.318644
Willow (1988) 3.658683 3.453543
Willy Wonka and the Chocolate Factory (1971) 4.063953 3.789474
Witness (1985) 4.115854 3.941504
Wizard of Oz, The (1939) 4.355030 4.203138
Wolf (1994) 3.074074 2.899083
Women on the Verge of a Nervous Breakdown (1988) 3.934307 3.865741
Wonder Boys (2000) 4.043796 3.913649
Working Girl (1988) 3.606742 3.312500
World Is Not Enough, The (1999) 3.337500 3.388889
Wrong Trousers, The (1993) 4.588235 4.478261
Wyatt Earp (1994) 3.147059 3.283898
X-Files: Fight the Future, The (1998) 3.489474 3.493797
X-Men (2000) 3.682310 3.851702
Year of Living Dangerously (1982) 3.951220 3.869403
Yellow Submarine (1968) 3.714286 3.689286
You've Got Mail (1998) 3.542424 3.275591
Young Frankenstein (1974) 4.289963 4.239177
Young Guns (1988) 3.371795 3.425620
Young Guns II (1990) 2.934783 2.904025
Young Sherlock Holmes (1985) 3.514706 3.363344
Zero Effect (1998) 3.864407 3.723140
eXistenZ (1999) 3.098592 3.289086

1216 rows × 2 columns

In [45]:
top_female_ratings=mean_ratings.sort_values(by='F',ascending=False)
top_female_ratings[:10]      #女性最喜欢的十大电影排行
Out[45]:
gender F M
title    
Close Shave, A (1995) 4.644444 4.473795
Wrong Trousers, The (1993) 4.588235 4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589
Wallace & Gromit: The Best of Aardman Animation (1996) 4.563107 4.385075
Schindler's List (1993) 4.562602 4.491415
Shawshank Redemption, The (1994) 4.539075 4.560625
Grand Day Out, A (1992) 4.537879 4.293255
To Kill a Mockingbird (1962) 4.536667 4.372611
Creature Comforts (1990) 4.513889 4.272277
Usual Suspects, The (1995) 4.513317 4.518248
In [51]:
#找出男女分歧最大的电影
mean_ratings['diff']=mean_ratings['M']-mean_ratings['F']
sorted_by_diff=mean_ratings.sort_values(by='diff')     #sort_values排序
print(sorted_by_diff[:5])
print(sorted_by_diff[::-1][:5])
sorted_by_diff[-5:]           #男女差别较大且 男性更喜欢的电影
gender                            F         M      diff
title                                                  
Dirty Dancing (1987)       3.790378  2.959596 -0.830782
Jumpin' Jack Flash (1986)  3.254717  2.578358 -0.676359
Grease (1978)              3.975265  3.367041 -0.608224
Little Women (1994)        3.870588  3.321739 -0.548849
Steel Magnolias (1989)     3.901734  3.365957 -0.535777
gender                                         F         M      diff
title                                                               
Good, The Bad and The Ugly, The (1966)  3.494949  4.221300  0.726351
Kentucky Fried Movie, The (1977)        2.878788  3.555147  0.676359
Dumb & Dumber (1994)                    2.697987  3.336595  0.638608
Longest Day, The (1962)                 3.411765  4.031447  0.619682
Cable Guy, The (1996)                   2.250000  2.863787  0.613787
Out[51]:
gender F M diff
title      
Cable Guy, The (1996) 2.250000 2.863787 0.613787
Longest Day, The (1962) 3.411765 4.031447 0.619682
Dumb & Dumber (1994) 2.697987 3.336595 0.638608
Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
In [57]:
#不考虑性别因素 只找出分歧最大的电影
rating_std_by_title=data.groupby('title')['rating'].std()   
print(rating_std_by_title[:5])       #每个电影打分的方差
rating_std_by_title=rating_std_by_title.loc[active_titles]
print(type(rating_std_by_title))
rating_std_by_title.sort_values(ascending=False)[:5]
title
$1,000,000 Duck (1971)           1.092563
'Night Mother (1986)             1.118636
'Til There Was You (1997)        1.020159
'burbs, The (1989)               1.107760
...And Justice for All (1979)    0.878110
Name: rating, dtype: float64
<class 'pandas.core.series.Series'>
Out[57]:
title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Name: rating, dtype: float64
In [1]:
#全美婴儿姓名分析
names1880=pd.read_csv('yob1880.txt',names=['name','sex','births'])
names1880.head()
Out[1]:
  name sex births
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
In [62]:
names1880.groupby('sex')['births'].sum()
Out[62]:
sex
F     90993
M    110493
Name: births, dtype: int64
In [2]:
years=range(1880,2011)
pieces=[]
columns=['names','sex','births']
for year in years:
    path='yob%d.txt' %year
    frame=pd.read_csv(path,names=columns)
    frame['year']=year
    pieces.append(frame)
#print(pieces)
names=pd.concat(pieces,ignore_index=True) 
In [78]:
names.head()
Out[78]:
  names sex births year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
In [3]:
total_births=names.pivot_table('births',index='year',columns='sex',aggfunc=sum)
total_births.head()
Out[3]:
sex F M
year    
1880 90993 110493
1881 91955 100748
1882 107851 113687
1883 112322 104632
1884 129021 114445
In [87]:
import matplotlib.pyplot as plt
total_births.plot(title="Total births by sex and year")
plt.show()
pandas—举例
pandas—举例

In [29]:
def add_prop(group):   #名字占比
    births=group.births.astype(float)
    group['prop']=births/births.sum()
    return group
names=names.groupby(['year','sex']).apply(add_prop)
names.head()
Out[29]:
  names sex births year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
In [31]:
def get_top1000(group):    #获取每年各男女前1000的
    return group.sort_values(by='births',ascending=False)[:1000]
grouped=names.groupby(['year','sex'])
top1000=grouped.apply(get_top1000)
top1000[:10]
Out[31]:
      names sex births year prop
year sex            
1880 F 0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
5 Margaret F 1578 1880 0.017342
6 Ida F 1472 1880 0.016177
7 Alice F 1414 1880 0.015540
8 Bertha F 1320 1880 0.014507
9 Sarah F 1288 1880 0.014155
In [32]:
boys=top1000[top1000['sex']=='M']
girls=top1000[top1000['sex']=='F']
total_births=top1000.pivot_table('births',index='year',columns='names',aggfunc=sum)
total_births.head()   #每年叫这些名字的人数
D:\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)
Out[32]:
names Aaden Aaliyah Aarav Aaron Aarush Ab Abagail Abb Abbey Abbie ... Zoa Zoe Zoey Zoie Zola Zollie Zona Zora Zula Zuri
year                                          
1880 NaN NaN NaN 102.0 NaN NaN NaN NaN NaN 71.0 ... 8.0 23.0 NaN NaN 7.0 NaN 8.0 28.0 27.0 NaN
1881 NaN NaN NaN 94.0 NaN NaN NaN NaN NaN 81.0 ... NaN 22.0 NaN NaN 10.0 NaN 9.0 21.0 27.0 NaN
1882 NaN NaN NaN 85.0 NaN NaN NaN NaN NaN 80.0 ... 8.0 25.0 NaN NaN 9.0 NaN 17.0 32.0 21.0 NaN
1883 NaN NaN NaN 105.0 NaN NaN NaN NaN NaN 79.0 ... NaN 23.0 NaN NaN 10.0 NaN 11.0 35.0 25.0 NaN
1884 NaN NaN NaN 97.0 NaN NaN NaN NaN NaN 98.0 ... 13.0 31.0 NaN NaN 14.0 6.0 8.0 58.0 27.0 NaN

5 rows × 6868 columns

In [33]:
import matplotlib.pyplot as plt
subset=total_births[['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True,figsize=(12,10),grid=True,title="Number of births per year")
plt.show()
pandas—举例
pandas—举例

In [39]:
table=top1000.pivot_table('prop',index='year',columns='sex',aggfunc=sum) #前1000名字占比
table.plot(title='Sum of table1000.prop by year and sex',yticks=np.linspace(0,1.2,13),xticks=range(1880,2020,10))
plt.show()   #每年前1000个名字占比逐渐减少 说明命名更加多样化
D:\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)
D:\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'sex' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)
pandas—举例
pandas—举例

In [44]:
df=boys[boys['year']==2010]
df[:5]
df.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1000 entries, (2010, M, 1676644) to (2010, M, 1677645)
Data columns (total 5 columns):
names     1000 non-null object
sex       1000 non-null object
births    1000 non-null int64
year      1000 non-null int64
prop      1000 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.0+ MB
In [52]:
#最后一个字母的变革
get_last_letter=lambda x:x[-1]
last_letters=names.names.map(get_last_letter)   #map匿名函数
last_letters.name='last_letter'
last_letters.head()
Out[52]:
0    y
1    a
2    a
3    h
4    e
Name: last_letter, dtype: object
In [55]:
table=names.pivot_table('births',index=last_letters,columns=['sex','year'],aggfunc=sum)
subtable=table.reindex(columns=[1910,1960,2010],level='year')
subtable.head()
Out[55]:
sex F M
year 1910 1960 2010 1910 1960 2010
last_letter            
a 108376.0 691247.0 670605.0 977.0 5204.0 28438.0
b NaN 694.0 450.0 411.0 3912.0 38859.0
c 5.0 49.0 946.0 482.0 15476.0 23125.0
d 6750.0 3729.0 2607.0 22111.0 262112.0 44398.0
e 133569.0 435013.0 313833.0 28655.0 178823.0 129012.0
In [56]:
subtable.sum()   #男女各年总数
Out[56]:
sex  year
F    1910     396416.0
     1960    2022062.0
     2010    1759010.0
M    1910     194198.0
     1960    2132588.0
     2010    1898382.0
dtype: float64
In [59]:
pandas—举例
letter_prop=subtable/subtable.sum().astype(float)
fig,axes=plt.subplots(2,1,figsize=(10,8))
letter_prop['M'].plot(kind='bar',rot=0,ax=axes[0],title='Male')
letter_prop['F'].plot(kind='bar',rot=0,ax=axes[1],title='Female',legend=False)
plt.show()
pandas—举例

相关文章:

  • 2022-12-23
  • 2021-12-14
  • 2022-12-23
  • 2021-08-14
  • 2022-01-07
  • 2021-11-16
  • 2021-05-17
  • 2022-01-11
猜你喜欢
  • 2021-11-03
  • 2021-10-19
  • 2021-12-16
  • 2022-02-10
  • 2021-05-20
  • 2022-12-23
  • 2022-12-23
相关资源
相似解决方案