数据分析实例-MovieLens 1M 数据集

MovieLens 1M数据集含有来自6000名用户对4000部电影的100万条评分数据。分为三个表：评分，用户信息，电影信息。这些数据都是dat文件格式。

读取3个数据集：

#coding=gbk
# The MovieLens 1M dataset holds about one million ratings given by roughly
# 6000 users to roughly 4000 movies. It ships as three '::'-delimited .dat
# tables (ratings, user info, movie info); each one is loaded below into its
# own pandas DataFrame with pandas.read_table.
import pandas as pd
import time

# time.clock() was removed in Python 3.8. Keep using it where it still exists
# so this reading stays comparable with any other time.clock() call in the
# script, and fall back to time.perf_counter() on newer interpreters.
start = time.clock() if hasattr(time, 'clock') else time.perf_counter()

filename1 = r'D:\datasets\users.dat'
filename2 = r'D:\datasets\ratings.dat'
filename3 = r'D:\datasets\movies.dat'

# Keep printed frames short.
pd.options.display.max_rows = 10

# The files have no header row, so column names are supplied explicitly.
# engine='python' is requested because the separator '::' is longer than one
# character, which the default C parser does not handle.
uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(filename1, sep='::', header=None, names=uname, engine='python')
print(users.head())  # age and occupation are given as numeric codes
#    user_id gender  age  occupation    zip
# 0        1      F    1          10  48067
# 1        2      M   56          16  70072
# 2        3      M   25          15  55117
# 3        4      M   45           7  02460
# 4        5      M   25          20  55455
print(users.shape)  # (6040, 5)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(filename2, header=None, sep='::', names=rnames, engine='python')
print(ratings.head())
#    user_id  movie_id  rating  timestamp
# 0        1      1193       5  978300760
# 1        1       661       3  978302109
# 2        1       914       3  978301968
# 3        1      3408       4  978300275
# 4        1      2355       5  978824291
# print(ratings.shape)  # (1000209, 4)

# NOTE(review): if reading movies.dat raises UnicodeDecodeError, the file is
# probably not UTF-8; passing an explicit encoding (e.g. 'latin-1') may be
# needed -- confirm against the local copy of the dataset.
mnames = ['movie_id', 'title', 'genres']  # genres: the genre(s) of the movie
movies = pd.read_table(filename3, header=None, sep='::', names=mnames, engine='python')
# print(movies.head())
#    movie_id                               title                        genres
# 0         1                    Toy Story (1995)   Animation|Children's|Comedy
# 1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
# 2         3             Grumpier Old Men (1995)                Comedy|Romance
# 3         4            Waiting to Exhale (1995)                  Comedy|Drama
# 4         5  Father of the Bride Part II (1995)                        Comedy
# print(movies.shape)  # (3883, 3)

年龄和职业都是使用编码的形式给出来的：

- Age is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"

- Occupation is chosen from the following choices:

	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

使用merge 函数将3个表进行合并

# Combine the three tables into one frame. No join keys are given, so pandas
# joins on the column names the two frames have in common: ratings is joined
# with users first, and the result is then joined with movies.
data = ratings.merge(users).merge(movies)
# print(data.head())
#    user_id  movie_id  rating  timestamp gender  age  occupation    zip  ...
# 0        1      1193       5  978300760      F    1          10  48067
# 1        2      1193       5  978298413      M   56          16  70072
# 2       12      1193       4  978220179      M   25          12  32793
# 3       15      1193       4  978199279      M   25           7  22903
# 4       17      1193       5  978158471      M   50           1  95350
# print(data.iloc[0])
# user_id                                            1
# movie_id                                        1193
# rating                                             5
# timestamp                                  978300760
# gender                                             F
# age                                                1
# occupation                                        10
# zip                                            48067
# title         One Flew Over the Cuckoo's Nest (1975)
# genres                                         Drama
# Name: 0, dtype: object

使用透视表，按性别计算每部电影的平均得分

# Mean rating of every movie, split by gender.
# index selects the row labels, values the column being aggregated, and
# columns the column(s) whose distinct values become the output columns
# (i.e. the grouping applied across columns).
mean_ratings = data.pivot_table(values='rating', index='title', columns='gender', aggfunc='mean')
# print(mean_ratings.head())
# gender                                F         M
# title
# $1,000,000 Duck (1971)         3.375000  2.761905
# 'Night Mother (1986)           3.388889  3.352941
# 'Til There Was You (1997)      2.675676  2.733333
# 'burbs, The (1989)             2.793478  2.962085
# ...And Justice for All (1979)  3.828571  3.689024

筛选出评分数不少于250条的电影，并基于这些电影进行分析

# Drop movies that received fewer than 250 ratings.
# Start by counting the ratings per title.
ratings_by_title = data.groupby('title').size()
print(ratings_by_title[:3])
# title
# $1,000,000 Duck (1971)       37
# 'Night Mother (1986)         70
# 'Til There Was You (1997)    52
# dtype: int64

# Titles (index labels) that have at least 250 ratings.
active_titles = ratings_by_title.index[ratings_by_title >= 250]
print(active_titles[:3])
# Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
#        '101 Dalmatians (1961)'],
#       dtype='object', name='title')

# Use active_titles as row labels to keep only those movies in mean_ratings.
mean_ratings = mean_ratings.loc[active_titles]
print(mean_ratings[:5])
# gender                                    F         M
# title
# 'burbs, The (1989)                 2.793478  2.962085
# 10 Things I Hate About You (1999)  3.646552  3.311966
# 101 Dalmatians (1961)              3.791444  3.500000
# 101 Dalmatians (1996)              3.240000  2.911215
# 12 Angry Men (1957)                4.184397  4.328421

# Movies rated highest by female viewers: order the rows by the F column,
# largest mean rating first, and show the top three.
top_ratings = mean_ratings.sort_values("F", ascending=False)
print(top_ratings.head(3))
# gender                                                F         M
# title
# Close Shave, A (1995)                          4.644444  4.473795
# Wrong Trousers, The (1993)                     4.588235  4.478261
# Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)  4.572650  4.464589

# Find the movies on which male and female viewers disagree the most.
# diff = male mean - female mean, so negative values mean women rated the
# movie higher and positive values mean men rated it higher.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

# Ascending sort: the first rows are the movies preferred by women.
sort_by_diff = mean_ratings.sort_values(by='diff')
print(sort_by_diff[:3])
# gender                            F         M      diff
# title
# Dirty Dancing (1987)       3.790378  2.959596 -0.830782
# Jumpin' Jack Flash (1986)  3.254717  2.578358 -0.676359
# Grease (1978)              3.975265  3.367041 -0.608224

# Reverse the row order and take the first three rows: the movies that men
# rated higher than women did.
print(sort_by_diff[::-1][:3])
# gender                                         F         M      diff
# title
# Good, The Bad and The Ugly, The (1966)  3.494949  4.221300  0.726351
# Kentucky Fried Movie, The (1977)        2.878788  3.555147  0.676359
# Dumb & Dumber (1994)                    2.697987  3.336595  0.638608

# Standard deviation of the ratings of each title, computed over all ratings
# of that title; a large value marks a movie viewers disagree on.
rating_std = data.groupby('title')['rating'].std()
# Keep only the titles that passed the 250-rating filter.
rating_std = rating_std.loc[active_titles]
print(rating_std.sort_values(ascending=False)[:3])
# title
# Dumb & Dumber (1994)               1.321333
# Blair Witch Project, The (1999)    1.316368
# Natural Born Killers (1994)        1.307198
# Name: rating, dtype: float64

# Report how long the script took, measured from `start`.
# time.clock() was removed in Python 3.8. It is still used where available so
# that this reading comes from the same clock as a `start` taken with
# time.clock(); on newer interpreters time.perf_counter() is used instead.
end = time.clock() if hasattr(time, 'clock') else time.perf_counter()
spending_time = end - start
print('花费的时间为：%.2f' % spending_time + 's')
# Sample output: 花费的时间为：11.13s