# coding:utf-8
__author__ = 'similarface'
#datalink=http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip
'''
BX-Users["User-ID";"Location";"Age"]
BX-Books["ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L"]
BX-Book-Ratings["User-ID";"ISBN";"Book-Rating"]
'''
# codecs handles the text-encoding conversion when reading the CSV dump files
import codecs, os, sys
from math import sqrt
# Small in-memory sample: user name -> {item name -> rating}.
# Used by the demo at the bottom of the file instead of the BX CSV dump.
users = {
    "Angelica": {
        "Blues Traveler": 3.5,
        "Broken Bells": 2.0,
        "Norah Jones": 4.5,
        "Phoenix": 5.0,
        "Slightly Stoopid": 1.5,
        "The Strokes": 2.5,
        "Vampire Weekend": 2.0,
    },
    "Bill": {
        "Blues Traveler": 2.0,
        "Broken Bells": 3.5,
        "Deadmau5": 4.0,
        "Phoenix": 2.0,
        "Slightly Stoopid": 3.5,
        "Vampire Weekend": 3.0,
    },
    "Chan": {
        "Blues Traveler": 5.0,
        "Broken Bells": 1.0,
        "Deadmau5": 1.0,
        "Norah Jones": 3.0,
        "Phoenix": 5,
        "Slightly Stoopid": 1.0,
    },
    "Dan": {
        "Blues Traveler": 3.0,
        "Broken Bells": 4.0,
        "Deadmau5": 4.5,
        "Phoenix": 3.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 2.0,
    },
    "Hailey": {
        "Broken Bells": 4.0,
        "Deadmau5": 1.0,
        "Norah Jones": 4.0,
        "The Strokes": 4.0,
        "Vampire Weekend": 1.0,
    },
    "Jordyn": {
        "Broken Bells": 4.5,
        "Deadmau5": 4.0,
        "Norah Jones": 5.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 4.0,
    },
    "Sam": {
        "Blues Traveler": 5.0,
        "Broken Bells": 2.0,
        "Norah Jones": 3.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.0,
        "The Strokes": 5.0,
    },
    "Veronica": {
        "Blues Traveler": 3.0,
        "Norah Jones": 5.0,
        "Phoenix": 4.0,
        "Slightly Stoopid": 2.5,
        "The Strokes": 3.0,
    },
}
class recommender:
    """User-based collaborative-filtering recommender.

    Ratings are held as ``{user: {item: rating}}``.  Recommendations for a
    user are built from the ratings of the ``k`` most similar other users,
    weighted by each neighbour's share of the total similarity.
    """

    def __init__(self, data, k=1, metric='pearson', n=5):
        """Create a recommender.

        :param data: ``{user: {item: rating}}`` mapping.  Any non-mapping
            value leaves the recommender empty (e.g. until ``loadBookDB``
            is called).
        :param k: number of nearest neighbours to combine.
        :param metric: name of the similarity metric; only ``'pearson'``
            is implemented.
        :param n: maximum number of recommendations to return.
        """
        self.k = k
        self.n = n
        # location string -> user id (filled by loadBookDB)
        self.username2id = {}
        # user id -> display string (filled by loadBookDB)
        self.userid2name = {}
        # product id (ISBN) -> display title (filled by loadBookDB)
        self.productid2name = {}
        self.metric = metric
        # Construction stays permissive for unknown metrics; the problem is
        # reported with a clear error when a similarity is first needed.
        self.fn = self.pearson if metric == 'pearson' else None
        # isinstance also accepts dict subclasses (OrderedDict, defaultdict);
        # always define self.data so later lookups never hit AttributeError.
        self.data = data if isinstance(data, dict) else {}

    def _openDump(self, path, filename):
        """Open one file of the BX CSV dump as UTF-8, ignoring bad bytes."""
        return codecs.open(os.path.join(path, filename), 'r', 'utf-8',
                           errors='ignore')

    def loadBookDB(self, path=''):
        """Load the BX ratings, books and users CSV files found in *path*.

        Replaces ``self.data`` with the ratings read from
        ``BX-Book-Ratings.csv`` and fills ``productid2name``,
        ``userid2name`` and ``username2id`` from ``BX-Books.csv`` and
        ``BX-Users.csv``.  Lines are split on ``';'``; lines with too few
        fields or a non-integer rating are skipped.  Prints the total
        number of lines read across the three files.
        """
        self.data = {}
        i = 0
        # Ratings: "User-ID";"ISBN";"Book-Rating"
        with self._openDump(path, 'BX-Book-Ratings.csv') as f:
            for line in f:
                i += 1
                fields = line.split(';')
                if len(fields) < 3:
                    continue
                user = fields[0].strip('"')
                book = fields[1].strip('"')
                try:
                    rating = int(fields[2].strip().strip('"'))
                except ValueError:
                    # Header row or malformed rating.
                    continue
                self.data.setdefault(user, {})[book] = rating
        # Books: "ISBN";"Book-Title";"Book-Author";...
        with self._openDump(path, 'BX-Books.csv') as f:
            for line in f:
                i += 1
                fields = line.split(';')
                if len(fields) < 3:
                    continue
                isbn = fields[0].strip('"')
                title = fields[1].strip('"')
                author = fields[2].strip('"')
                self.productid2name[isbn] = title + ' by ' + author
        # Users: "User-ID";"Location";"Age"
        with self._openDump(path, 'BX-Users.csv') as f:
            for line in f:
                i += 1
                fields = line.split(';')
                if len(fields) < 2:
                    continue
                userid = fields[0].strip('"')
                location = fields[1].strip('"')
                # Age is the last of the three columns; taking the last
                # field also copes with a stray ';' inside the location.
                if len(fields) >= 3:
                    age = fields[-1].strip().strip('"')
                else:
                    age = 'NULL'
                if age != 'NULL':
                    value = location + ' (age: ' + age + ')'
                else:
                    value = location
                self.userid2name[userid] = value
                self.username2id[location] = userid
        print(i)

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two rating dicts.

        Only items present in both dicts are compared.  The coefficient
        measures the linear correlation between the two sets of ratings
        and lies between -1 and 1.  Rough interpretation of its magnitude:

            0.8-1.0  very strong correlation
            0.6-0.8  strong correlation
            0.4-0.6  moderate correlation
            0.2-0.4  weak correlation
            0.0-0.2  very weak or no correlation

        Returns 0 when there are no common items or when either side has
        no variance over the common items.
        """
        sum_xy, sum_x, sum_y, sum_x2, sum_y2, n = 0, 0, 0, 0, 0, 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += x ** 2
                sum_y2 += y ** 2
        if n == 0:
            return 0
        # Force true division even for int ratings on Python 2.
        n = float(n)
        # Clamp at 0: rounding can push a zero variance slightly negative,
        # which would make sqrt raise.
        spread_x = max(sum_x2 - (sum_x ** 2) / n, 0.0)
        spread_y = max(sum_y2 - (sum_y ** 2) / n, 0.0)
        fenmu = sqrt(spread_x) * sqrt(spread_y)
        if fenmu == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / fenmu

    def computeNearesNeighbor(self, username):
        """Return ``[(other_user, similarity), ...]``, most similar first.

        :raises ValueError: if the configured metric is not supported.
        :raises KeyError: if *username* has no ratings in ``self.data``.
        """
        similarity = getattr(self, 'fn', None)
        if similarity is None:
            raise ValueError('unsupported metric: %r' % (self.metric,))
        target = self.data[username]
        distinces = [(instance, similarity(target, ratings))
                     for instance, ratings in self.data.items()
                     if instance != username]
        distinces.sort(key=lambda pair: pair[1], reverse=True)
        return distinces

    def recommend(self, user):
        """Return up to ``n`` ``(item_name, score)`` pairs for *user*.

        Items the user has not rated are scored from the ratings of the
        ``k`` nearest neighbours, each weighted by its share of the summed
        similarity.  The result is sorted by score, highest first.  An
        empty list is returned when there are no neighbours or their
        similarities sum to zero (no usable weighting).
        """
        recommendations = {}
        nearest = self.computeNearesNeighbor(user)
        userRating = self.data[user]
        # Use at most k neighbours; fewer if the data set is small.
        neighbours = nearest[:max(self.k, 0)]
        totalDistance = 0.0
        for _, score in neighbours:
            totalDistance += score
        if totalDistance == 0:
            return []
        for name, score in neighbours:
            weight = score / totalDistance
            neighborRatings = self.data[name]
            for artist in neighborRatings:
                # Only recommend items the user has not rated yet.
                if artist not in userRating:
                    recommendations[artist] = (
                        recommendations.get(artist, 0)
                        + neighborRatings[artist] * weight
                    )
        ranked = [(self.convertProductID2name(item), value)
                  for item, value in recommendations.items()]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:self.n]

    def convertProductID2name(self, id):
        """Return the display name for product *id*, or *id* if unknown."""
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the *n* highest ratings given by user *id*.

        :param id: key of the user in ``self.data``.
        :param n: number of ratings to print.
        """
        # Fall back to the raw id when no display name was loaded
        # (e.g. when the data came from a plain dict).
        print("Ratings for " + self.userid2name.get(id, id))
        ratings = self.data[id]
        print(len(ratings))
        ratings = [(self.convertProductID2name(item), value)
                   for item, value in ratings.items()]
        ratings.sort(key=lambda pair: pair[1], reverse=True)
        for rating in ratings[:n]:
            print("%s\t%i" % (rating[0], rating[1]))
if __name__ == '__main__':
    # Demo 1: recommend from the small in-memory sample ratings.
    engine = recommender(users)
    sample_result = engine.recommend('Veronica')
    print(sample_result)
    # Demo 2: replace the data with the BX CSV dump and recommend for one
    # of its user ids.
    bx_dump_dir = u'D:/360安全浏览器下载/BX-CSV-Dump'
    engine.loadBookDB(bx_dump_dir)
    bx_result = engine.recommend('276737')
    print(bx_result)