# 数据挖掘之协同过滤 (Data mining: collaborative filtering)

# coding:utf-8
__author__ = 'similarface'
#datalink=http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip
'''
BX-Users["User-ID";"Location";"Age"]
BX-Books["ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L"]
BX-Book-Ratings["User-ID";"ISBN";"Book-Rating"]
'''

#专门用作编码转换
import codecs, os, sys
from math import sqrt

# Small in-memory sample: listener name -> {artist: rating}.
# Used by the demo at the bottom of the file to exercise the recommender
# without needing the Book-Crossing CSV files.
users = {
    "Angelica": {
        "Blues Traveler": 3.5,
        "Broken Bells": 2.0,
        "Norah Jones": 4.5,
        "Phoenix": 5.0,
        "Slightly Stoopid": 1.5,
        "The Strokes": 2.5,
        "Vampire Weekend": 2.0,
    },
    "Bill": {
        "Blues Traveler": 2.0,
        "Broken Bells": 3.5,
        "Deadmau5": 4.0,
        "Phoenix": 2.0,
        "Slightly Stoopid": 3.5,
        "Vampire Weekend": 3.0,
    },
    "Chan": {
        "Blues Traveler": 5.0,
        "Broken Bells": 1.0,
        "Deadmau5": 1.0,
        "Norah Jones": 3.0,
        "Phoenix": 5,
        "Slightly Stoopid": 1.0,
    },
    "Dan": {
        "Blues Traveler": 3.0,
        "Broken Bells": 4.0,
        "Deadmau5": 4.5,
        "Phoenix": 3.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 2.0,
    },
    "Hailey": {
        "Broken Bells": 4.0,
        "Deadmau5": 1.0,
        "Norah Jones": 4.0,
        "The Strokes": 4.0,
        "Vampire Weekend": 1.0,
    },
    "Jordyn": {
        "Broken Bells": 4.5,
        "Deadmau5": 4.0,
        "Norah Jones": 5.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 4.0,
    },
    "Sam": {
        "Blues Traveler": 5.0,
        "Broken Bells": 2.0,
        "Norah Jones": 3.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.0,
        "The Strokes": 5.0,
    },
    "Veronica": {
        "Blues Traveler": 3.0,
        "Norah Jones": 5.0,
        "Phoenix": 4.0,
        "Slightly Stoopid": 2.5,
        "The Strokes": 3.0,
    },
}


class recommender:
    """User-based collaborative-filtering recommender.

    Ratings are held in ``self.data`` as ``{user: {item: rating}}``.  An item
    the target user has not rated is scored by summing the ratings given to
    it by the ``k`` most similar users, each weighted by that user's share of
    the total similarity.
    """

    def __init__(self, data, k=1, metric='pearson', n=5):
        """Set up the recommender.

        :param data: ``{user: {item: rating}}`` mapping.  Anything that is
            not a dict is ignored and the rating table starts out empty
            (it can be filled later with ``loadBookDB``).
        :param k: number of nearest neighbours consulted by ``recommend``.
        :param metric: name of the similarity metric; only ``'pearson'`` is
            implemented.
        :param n: maximum number of recommendations returned.
        """
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        self.metric = metric
        # Always define ``fn`` so an unsupported metric is reported clearly
        # when a similarity is first needed, not as an AttributeError.
        self.fn = self.pearson if metric == 'pearson' else None
        # Always define ``data`` so the other methods never hit a missing
        # attribute; isinstance also accepts dict subclasses.
        self.data = data if isinstance(data, dict) else {}

    def loadBookDB(self, path=''):
        """Load the Book-Crossing CSV dump found in directory ``path``.

        Reads ``BX-Book-Ratings.csv`` into ``self.data`` (replacing whatever
        was there), ``BX-Books.csv`` into ``self.productid2name`` and
        ``BX-Users.csv`` into ``self.userid2name`` / ``self.username2id``,
        then prints the total number of lines read from the three files.
        """
        self.data = {}
        total = self._load_ratings(os.path.join(path, 'BX-Book-Ratings.csv'))
        total += self._load_books(os.path.join(path, 'BX-Books.csv'))
        total += self._load_users(os.path.join(path, 'BX-Users.csv'))
        print(total)

    def _load_ratings(self, filename):
        """Read user;isbn;rating rows into self.data; return lines read."""
        count = 0
        # ``with`` closes the file even if a row raises part-way through.
        with codecs.open(filename, 'r', 'utf-8', errors='ignore') as f:
            for line in f:
                count += 1
                fields = line.split(';')
                if len(fields) < 3:
                    # Blank or truncated line: nothing usable on it.
                    continue
                user = fields[0].strip('"')
                book = fields[1].strip('"')
                try:
                    rating = int(fields[2].strip().strip('"'))
                except ValueError:
                    # Header row or non-numeric rating.
                    continue
                self.data.setdefault(user, {})[book] = rating
        return count

    def _load_books(self, filename):
        """Read isbn;title;author rows into self.productid2name; return lines read."""
        count = 0
        with codecs.open(filename, 'r', 'utf8', errors='ignore') as f:
            for line in f:
                count += 1
                fields = line.split(';')
                if len(fields) < 3:
                    continue
                isbn = fields[0].strip('"')
                title = fields[1].strip('"')
                author = fields[2].strip('"')
                self.productid2name[isbn] = title + 'by' + author
        return count

    def _load_users(self, filename):
        """Read id;location;age rows into the user lookup tables; return lines read."""
        count = 0
        with codecs.open(filename, 'r', 'utf8', errors='ignore') as f:
            for line in f:
                count += 1
                fields = line.split(';')
                if len(fields) < 2:
                    continue
                userid = fields[0].strip('"')
                location = fields[1].strip('"')
                # The file has three columns, so the age is present as soon
                # as there are three fields.  It is read from the last field
                # so a ';' inside the location does not shift it.
                if len(fields) >= 3:
                    age = fields[-1].strip().strip('"')
                else:
                    age = 'NULL'
                if age != 'NULL':
                    value = location + ' (age: ' + age + ')'
                else:
                    value = location
                self.userid2name[userid] = value
                self.username2id[location] = userid
        return count

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two users' ratings.

        The Pearson product-moment correlation coefficient (PPMCC, usually
        written r) measures the linear correlation between two variables and
        lies between -1 and 1.  Rough reading of its magnitude:

            0.8-1.0  very strong correlation
            0.6-0.8  strong correlation
            0.4-0.6  moderate correlation
            0.2-0.4  weak correlation
            0.0-0.2  very weak or no correlation

        Only items present in both ``rating1`` and ``rating2`` are used.
        Returns 0 when there are no such items or when the denominator is
        zero (for example when one user gave every shared item the same
        rating).
        """
        # Float accumulators keep every division below a true division even
        # under Python 2 with integer ratings.
        sum_xy = sum_x = sum_y = sum_x2 = sum_y2 = 0.0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += x ** 2
                sum_y2 += y ** 2
        if n == 0:
            return 0
        # Rounding can push a mathematically non-negative term slightly
        # below zero; clamp so sqrt does not raise a domain error.
        spread_x = max(sum_x2 - (sum_x ** 2) / n, 0.0)
        spread_y = max(sum_y2 - (sum_y ** 2) / n, 0.0)
        fenmu = sqrt(spread_x) * sqrt(spread_y)
        if fenmu == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / fenmu

    def computeNearesNeighbor(self, username):
        """Return ``[(other_user, similarity), ...]`` for every other user.

        The list is sorted by similarity to ``username``, highest first.

        :raises ValueError: if the recommender was built with a metric that
            is not implemented.
        :raises KeyError: if ``username`` has no ratings while other users
            are present.
        """
        if self.fn is None:
            raise ValueError('unsupported metric: %r' % (self.metric,))
        distinces = [
            (instance, self.fn(self.data[username], self.data[instance]))
            for instance in self.data
            if instance != username
        ]
        distinces.sort(key=lambda pair: pair[1], reverse=True)
        return distinces

    def recommend(self, user):
        """Return up to ``self.n`` ``(item_name, score)`` recommendations.

        Uses the ``self.k`` most similar users.  Each neighbour's rating of
        an item the target user has not rated is weighted by
        ``similarity / sum_of_similarities`` and the weighted ratings are
        summed per item.  The result is sorted by score, highest first.

        Returns an empty list when there are no neighbours or when the
        neighbours' similarities sum to zero, since the weights are then
        undefined.
        """
        nearest = self.computeNearesNeighbor(user)
        userRating = self.data[user]
        # Slicing tolerates having fewer than k neighbours available.
        neighbors = nearest[:max(self.k, 0)]
        totalDistance = 0.0
        for _, similarity in neighbors:
            totalDistance += similarity
        if totalDistance == 0:
            return []
        recommendations = {}
        for name, similarity in neighbors:
            weight = similarity / totalDistance
            # Walk the items rated by this neighbour ...
            for artist, rating in self.data[name].items():
                # ... keeping only those the target user has not rated.
                if artist in userRating:
                    continue
                contribution = rating * weight
                if artist in recommendations:
                    recommendations[artist] += contribution
                else:
                    recommendations[artist] = contribution
        ranked = [(self.convertProductID2name(item), score)
                  for item, score in recommendations.items()]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:self.n]

    def convertProductID2name(self, id):
        """Return the product name for ``id``, or ``id`` itself if unknown."""
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the ``n`` highest ratings given by user ``id``.

        Prints a header line with the user's display name (falling back to
        ``id`` when the user table has no entry), then the number of items
        the user rated, then one ``name<TAB>rating`` line per item.

        :param id: key of the user in ``self.data``.
        :param n: maximum number of ratings to print.
        """
        print("Ratings for %s" % (self.userid2name.get(id, id),))
        ratings = self.data[id]
        print(len(ratings))
        ranked = [(self.convertProductID2name(item), score)
                  for item, score in ratings.items()]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        for name, score in ranked[:n]:
            print("%s\t%i" % (name, score))


if __name__ == '__main__':
    # Demo 1: recommend an artist to 'Veronica' from the in-memory sample.
    r = recommender(users)
    print(r.recommend('Veronica'))

    # Demo 2: load the Book-Crossing dump and recommend books to one user.
    # The dataset directory may be given as the first command-line argument;
    # without one, the author's original location is used.
    if len(sys.argv) > 1:
        bx_dir = sys.argv[1]
    else:
        bx_dir = u'D:/360安全浏览器下载/BX-CSV-Dump'
    r.loadBookDB(bx_dir)
    print(r.recommend('276737'))

# result (output recorded from a run of the script above):
# [('Blues Traveler', 5.0)]
# 1700021
# [(u"Devil's Waltz (Alex Delaware Novels (Paperback))byJonathan Kellerman", 9.0), (u'Silent Partner (Alex Delaware Novels (Paperback))byJonathan Kellerman', 8.0), (u'The Outsiders (Now in Speak!)byS. E. Hinton', 8.0), (u'Sein LanguagebyJERRY SEINFELD', 8.0), (u'The Girl Who Loved Tom GordonbyStephen King', 8.0)]