0001"""Models for prediction of user ratings of items using collaborative filtering.
0002
0003The classes `DistanceModel`, `SimpleSimilarityModel`, and `SMSimilarityModel`
0004use different methods for judging the similarity of two users given their
0005rating histories.
0006
0007The `getRecommendations` method for the above classes returns a list of
0008items and their scores (how relevant they are to the given user), sorted
0009from most recommended to least recommended. It does this by weighting other
0010users' preferences by their similarity to the user for whom recommendations
0011are being requested.
0012
0013The example below uses the AudioScrobbler service to obtain preference data,
0014then uses `getRecommendations` to suggest artists for a given user.
0015
0016Usage
0017-----
>>> import shelve
>>> from consensus import DistanceModel, SimpleSimilarityModel, SMSimilarityModel
0019>>> def loadPreferences():
0020...    d = shelve.open("scrobbler.pickle")
0021...    judgeRatings = {}
0022...    for user, artistsInfo in d.iteritems():
0023...        judgeRatings[user] = {}
0024...        for artistName, playCount in artistsInfo:
0025...            judgeRatings[user][artistName] = playCount
0026...    return judgeRatings
0027
0028>>> judgeRatings = loadPreferences()
0029>>> distance = DistanceModel(judgeRatings)
0030>>> similarity = SimpleSimilarityModel(judgeRatings)
0031>>> sm_similarity = SMSimilarityModel(judgeRatings)
0032
0033>>> [artist for artist, score in distance.getRecommendations('eggs_again', limit=10)]
0034[u'Sufjan Stevens', u'Elliott Smith', u'Broken Social Scene', u'Belle and Sebastian',
0035 u'The Beatles', u'Wilco', u'The Decemberists', u'Interpol', u'Bright Eyes', u'Beck']
0036
0037>>> [artist for artist, score in similarity.getRecommendations('eggs_again', limit=10)]
0038[u'Sufjan Stevens', u'Elliott Smith', u'Broken Social Scene', u'The Beatles',
0039 u'Belle and Sebastian', u'Wilco', u'Interpol', u'The Decemberists', u'Beck',
0040 u'Bright Eyes']
0041
0042>>> [artist for artist, score in sm_similarity.getRecommendations('eggs_again', limit=10)]
0043[u'Sufjan Stevens', u'Elliott Smith', u'Broken Social Scene', u'Belle and Sebastian',
0044 u'The Beatles', u'Interpol', u'Wilco', u'The Decemberists', u'Bright Eyes',
0045 u'Death Cab for Cutie']
0046"""
0047import math
0048from operator import itemgetter
0049
class Model(object):
    """The base class for all collaborative filtering models.

    Not usable unless subclassed: `similarity` must be overridden.

    The methods of this class treat the scores returned by `similarity` as
    distances: `getNeighbors` sorts them in ascending order and
    `getRecommendations` divides ratings by them, so a lower score means
    that two judges are more alike.
    """

    # Divisor used by `getRecommendations` in place of a similarity score of
    # exactly zero, which would otherwise raise ZeroDivisionError.
    minSimilarity = 1e-9

    def __init__(self, judgeRatings=None, judges=None):
        """Initialize the model with a group of judges and their rating histories

        `judgeRatings` is a dictionary mapping judges to their rating histories.
        Rating histories are represented as dictionaries mapping items to their
        rating by that judge. The dictionary is stored as given, not copied.

        If `judgeRatings` is missing or empty and a list of `judges` is given,
        every judge in it starts with an empty rating history. With neither
        argument the model starts with no judges at all.
        """
        if judgeRatings:
            self.judgeRatings = judgeRatings
        elif judges:
            self.judgeRatings = dict((j, {}) for j in judges)
        else:
            self.judgeRatings = {}

    def similarity(self, j1, j2, neutralScore=0, items=None):
        """Calculate the similarity of judges j1 and j2.

        If `neutralScore` is given, it is the midpoint of the rating scale
        being used.

        If a list of `items` is given, the judges' ratings for only those
        items will be considered when determining similarity.

        Always raises NotImplementedError here; subclasses supply the
        actual calculation.
        """
        raise NotImplementedError("Subclasses must implement a similarity function")

    def getNeighbors(self, judge, limit=None, threshold=None, neutralScore=0,
                     items=None):
        """Get a sorted list of neighbors for the given judge, where neighbors
        are defined as judges with preferences most similar to the given judge.

        `judge` is the user for whom a list of neighbors will be calculated.

        If `limit` is given, it will limit the length of the list of neighbors
        being returned. If `threshold` is given, it specifies the cutoff point
        for which neighbors will be included: judges whose score is greater
        than `threshold` are left out. This will be highly dependent on
        which model is chosen. A lower `threshold` will result in fewer
        neighbors being returned, and a `threshold` of 0 is a valid cutoff.

        `neutralScore` and `items` are as defined in `Model.similarity`.

        Returns a list of `(judge, score)` tuples sorted by ascending score,
        i.e. the most similar judge first. `judge` itself is never included.
        """
        neighbors = []
        for j in self.judgeRatings:
            if j == judge:
                continue
            similarity = self.similarity(j, judge, neutralScore, items)
            # Compare against None rather than relying on truthiness, so that
            # a threshold of 0 is applied instead of being ignored.
            if threshold is not None and similarity > threshold:
                continue
            neighbors.append((j, similarity))
        return sorted(neighbors, key=itemgetter(1))[:limit]

    def getRecommendations(self, judge, limit=None, neutralScore=0,
                           items=None):
        """Return a list of items and their scores (how relevant they are to
        the given judge), sorted from most recommended to least recommended.

        Only items that `judge` has not rated and that some other judge rated
        above `neutralScore` are scored. Each such rating is divided by the
        similarity score between its judge and `judge`, and the results are
        summed per item.

        If `limit` is given, it limits the length of the returned list.
        `neutralScore` and `items` are as defined in `Model.similarity`.

        Returns a list of `(item, score)` tuples. Raises KeyError if `judge`
        has no entry in `judgeRatings`.
        """
        ownRatings = self.judgeRatings[judge]
        recommendScores = {}
        for j, ratings in self.judgeRatings.items():
            # `judge` never yields candidates here, because every item in its
            # own history is excluded by the `not in ownRatings` test.
            candidates = [(i, rating) for i, rating in ratings.items()
                          if rating > neutralScore and i not in ownRatings]
            if not candidates:
                continue
            # The score depends only on the pair of judges, so compute it once
            # per judge instead of once per item.
            similarity = self.similarity(j, judge, neutralScore, items)
            if not similarity:
                # A score of zero would divide by zero; use a tiny divisor so
                # that such judges carry the largest weight instead.
                similarity = self.minSimilarity
            for i, rating in candidates:
                recommendScores[i] = recommendScores.get(i, 0) + rating / similarity

        return sorted(recommendScores.items(),
                      key=itemgetter(1), reverse=True)[:limit]
0124
class DistanceModel(Model):
    """Uses the vector distance in n-dimensional space defined by the users'
    ratings for n items.

    The score is a Euclidean distance: 0 means the compared ratings are
    identical, and larger values mean the judges are less alike.
    """
    def similarity(self, j1, j2, neutralScore=0, items=None):
        """Return the Euclidean distance between the ratings of j1 and j2.

        See `Model.similarity` for argument information. When `items` is not
        given, the items rated by both judges are compared. When `items` is
        given, an item a judge has not rated counts as `neutralScore`.

        If there are no items to compare, the distance is reported as
        infinity rather than 0, so that judges with nothing in common are
        not mistaken for judges with identical ratings.
        """
        ratings1 = self.judgeRatings[j1]
        ratings2 = self.judgeRatings[j2]
        if items is None:
            items = set(ratings1) & set(ratings2)
        if not items:
            # An empty sum would give a distance of 0, the score reserved
            # for judges whose ratings match exactly.
            return float('inf')
        return math.sqrt(sum(pow(ratings1.get(i, neutralScore) -
                                 ratings2.get(i, neutralScore), 2)
                             for i in items))
0134
class SimpleSimilarityModel(Model):
    """Uses a simple function where a larger intersection of preferred items
    results in greater similarity.

    The score is the reciprocal of the number of items both judges rated
    above the neutral score, so it shrinks as the judges agree on more items.
    """
    def similarity(self, j1, j2, neutralScore=0, items=None):
        """Return 1.0 / n, where n is the number of items both judges prefer.

        See `Model.similarity` for argument information. An item is preferred
        by a judge when its rating is greater than `neutralScore`. When
        `items` is not given, the items rated by both judges are examined.
        When `items` is given, an item a judge has not rated counts as
        `neutralScore` and is therefore never a shared preference.

        Returns 2.0 when the judges share no preferred item.
        """
        ratings1 = self.judgeRatings[j1]
        ratings2 = self.judgeRatings[j2]
        if items is None:
            items = set(ratings1) & set(ratings2)
        # `.get` keeps an explicit `items` list from raising KeyError for an
        # item that one of the judges has not rated.
        similarityCount = sum(1 for item in items
                              if ratings1.get(item, neutralScore) > neutralScore
                              and ratings2.get(item, neutralScore) > neutralScore)
        if similarityCount:
            return 1.0 / similarityCount
        return 2.0
0148
class SMSimilarityModel(Model):
    """Uses the constrained Pearson correlation function (Shardanand & Maes 1995)

    The correlation r is computed on ratings centred on the neutral score and
    reported as 2.0 - r, so that a lower score means greater similarity.
    """
    def similarity(self, j1, j2, neutralScore=0, items=None):
        """Return 2.0 minus the constrained Pearson correlation of j1 and j2.

        See `Model.similarity` for argument information. When `items` is not
        given, the items rated by both judges are compared. When `items` is
        given, an item a judge has not rated counts as `neutralScore` and so
        contributes nothing to the correlation.

        Returns 1.0 when the correlation is undefined, i.e. when there is
        nothing to compare or one judge rated every compared item exactly
        `neutralScore`.
        """
        ratings1 = self.judgeRatings[j1]
        ratings2 = self.judgeRatings[j2]
        if items is None:
            items = set(ratings1) & set(ratings2)
        top = bottomleft = bottomright = 0
        for item in items:
            # Centre both ratings on the neutral score. The same deviations
            # must feed the numerator and the denominator, otherwise the
            # ratio is not a correlation once neutralScore is non-zero.
            deviation1 = ratings1.get(item, neutralScore) - neutralScore
            deviation2 = ratings2.get(item, neutralScore) - neutralScore
            top += deviation1 * deviation2
            bottomleft += math.pow(deviation1, 2)
            bottomright += math.pow(deviation2, 2)
        if not bottomleft or not bottomright:
            # NOTE(review): 1.0 is also the score of perfectly correlated
            # judges; confirm that "no evidence" is meant to rank that high.
            return 1.0
        similarity = float(top) / math.sqrt(bottomleft * bottomright)
        return (2.0 - similarity)