0001"""Models for prediction of user ratings of items using collaborative filtering.
0002
0003The classes `DistanceModel`, `SimpleSimilarityModel`, and `SMSimilarityModel`
0004use different methods for judging the similarity of two users given their
0005rating histories.
0006
0007The `getRecommendations` method for the above classes returns a list of
0008items and their scores (how relevant they are to the given user), sorted
0009from most recommended to least recommended. It does this by weighting other
0010users' preferences by their similarity to the user for whom recommendations
0011are being requested.
0012
0013The example below uses the AudioScrobbler service to obtain preference data,
0014then uses `getRecommendations` to suggest artists for a given user.
0015
0016Usage
0017-----
0018>>> from consensus import DistanceModel, SimpleSimilarityModel, SMSimilarityModel
0019>>> def loadPreferences():
0020... d = shelve.open("scrobbler.pickle")
0021... judgeRatings = {}
0022... for user, artistsInfo in d.iteritems():
0023... judgeRatings[user] = {}
0024... for artistName, playCount in artistsInfo:
0025... judgeRatings[user][artistName] = playCount
0026... return judgeRatings
0027
0028>>> judgeRatings = loadPreferences()
0029>>> distance = DistanceModel(judgeRatings)
0030>>> similarity = SimpleSimilarityModel(judgeRatings)
0031>>> sm_similarity = SMSimilarityModel(judgeRatings)
0032
0033>>> [artist for artist, score in distance.getRecommendations('eggs_again', limit=10)]
0034[u'Sufjan Stevens', u'Elliott Smith', u'Broken Social Scene', u'Belle and Sebastian',
0035 u'The Beatles', u'Wilco', u'The Decemberists', u'Interpol', u'Bright Eyes', u'Beck']
0036
0037>>> [artist for artist, score in similarity.getRecommendations('eggs_again', limit=10)]
0038[u'Sufjan Stevens', u'Elliott Smith', u'Broken Social Scene', u'The Beatles',
0039 u'Belle and Sebastian', u'Wilco', u'Interpol', u'The Decemberists', u'Beck',
0040 u'Bright Eyes']
0041
0042>>> [artist for artist, score in sm_similarity.getRecommendations('eggs_again', limit=10)]
0043[u'Sufjan Stevens', u'Elliott Smith', u'Broken Social Scene', u'Belle and Sebastian',
0044 u'The Beatles', u'Interpol', u'Wilco', u'The Decemberists', u'Bright Eyes',
0045 u'Death Cab for Cutie']
0046"""
0047import math
0048from operator import itemgetter
0049
class Model(object):
    """The base class for all collaborative filtering models.

    Not usable unless subclassed: subclasses must override `similarity`.

    Despite its name, the value returned by `similarity` is consumed as a
    *distance*: `getNeighbors` sorts it in ascending order and
    `getRecommendations` divides ratings by it, so smaller values mean that
    two judges are more alike.
    """

    # Divisor substituted for a distance of exactly zero in
    # `getRecommendations`, so that a judge who is indistinguishable from the
    # target dominates the ranking instead of raising ZeroDivisionError.
    _MIN_DISTANCE = 1e-9

    def __init__(self, judgeRatings=None, judges=None):
        """Initialize the model with a group of judges and their rating histories.

        `judgeRatings` is a dictionary mapping judges to their rating
        histories. Rating histories are represented as dictionaries mapping
        items to their rating by that judge. The dictionary is stored as-is
        (not copied).

        If `judgeRatings` is missing or empty and a list of `judges` is
        given, every judge starts out with an empty rating history. With
        neither argument the model starts out empty.
        """
        if judgeRatings:
            self.judgeRatings = judgeRatings
        elif judges:
            self.judgeRatings = dict((j, {}) for j in judges)
        else:
            self.judgeRatings = {}

    def similarity(self, j1, j2, neutralScore=0, items=None):
        """Calculate the similarity of judges j1 and j2.

        If `neutralScore` is given, it is the midpoint of the rating scale
        being used.

        If a list of `items` is given, the judges' ratings for only those
        items will be considered when determining similarity.

        Raises NotImplementedError; subclasses must provide the calculation.
        """
        raise NotImplementedError("Subclasses must implement a similarity function")

    def getNeighbors(self, judge, limit=None, threshold=None, neutralScore=0,
                     items=None):
        """Get a sorted list of neighbors for the given judge, where neighbors
        are defined as judges with preferences most similar to the given judge.

        `judge` is the user for whom a list of neighbors will be calculated.

        If `limit` is given, it will limit the length of the list of neighbors
        being returned. If `threshold` is given, neighbors whose similarity
        value is greater than it are left out, so a lower `threshold` will
        result in fewer neighbors being returned. A suitable value is highly
        dependent on which model is chosen.

        `neutralScore` and `items` are as defined in `Model.similarity`.

        Returns a list of `(judge, similarity)` tuples sorted by ascending
        similarity value (closest neighbor first).
        """
        neighbors = []
        for other in self.judgeRatings:
            if other == judge:
                continue
            distance = self.similarity(other, judge, neutralScore, items)
            # Compare against None explicitly: a threshold of 0 is a valid
            # cutoff and must not be treated as "no threshold".
            if threshold is not None and distance > threshold:
                continue
            neighbors.append((other, distance))
        neighbors.sort(key=itemgetter(1))
        return neighbors[:limit]

    def getRecommendations(self, judge, limit=None, neutralScore=0,
                           items=None):
        """Return a list of items and their scores (how relevant they are to
        the given judge), sorted from most recommended to least recommended.

        Only items that `judge` has not rated, and that some other judge
        rated above `neutralScore`, are scored. Each such rating is divided
        by the similarity value between the two judges and the results are
        summed per item.

        `limit`, `neutralScore`, and `items` are all as defined in
        `Model.getNeighbors`.

        Raises KeyError if `judge` has no entry in `judgeRatings`.
        """
        ownRatings = self.judgeRatings[judge]
        scores = {}
        # `items()` rather than `iteritems()` so this runs on Python 2 and 3.
        for other, ratings in self.judgeRatings.items():
            if other == judge:
                # Everything this judge rated is excluded below anyway.
                continue
            candidates = [(item, rating) for item, rating in ratings.items()
                          if rating > neutralScore and item not in ownRatings]
            if not candidates:
                continue
            # The similarity value depends only on the pair of judges, so it
            # is computed once per judge rather than once per item.
            distance = self.similarity(other, judge, neutralScore, items)
            if distance == 0:
                distance = self._MIN_DISTANCE
            for item, rating in candidates:
                scores[item] = scores.get(item, 0) + rating / distance

        return sorted(scores.items(), key=itemgetter(1), reverse=True)[:limit]
0124
class DistanceModel(Model):
    """Uses the vector distance in n-dimensional space defined by the users' ratings for n items"""

    def similarity(self, j1, j2, neutralScore=0, items=None):
        """Return the Euclidean distance between the ratings of j1 and j2.

        See `Model.similarity` for argument information.

        When `items` is not given, only items rated by both judges are
        compared. When it is given, an item a judge has not rated counts as
        `neutralScore` for that judge.

        If there is nothing to compare (no shared items, or an empty
        `items`), infinity is returned: an empty overlap is no evidence that
        two judges agree, so it must not look like a perfect match (0.0).
        """
        first = self.judgeRatings[j1]
        second = self.judgeRatings[j2]
        if items is None:
            items = set(first) & set(second)
        squaredDiffs = [pow(first.get(i, neutralScore) - second.get(i, neutralScore), 2)
                        for i in items]
        if not squaredDiffs:
            return float('inf')
        return math.sqrt(sum(squaredDiffs))
0134
class SimpleSimilarityModel(Model):
    """Uses a simple function where a larger intersection of preferred items results in greater similarity"""

    def similarity(self, j1, j2, neutralScore=0, items=None):
        """Return a value that shrinks as j1 and j2 share more preferred items.

        See `Model.similarity` for argument information.

        An item counts as shared when both judges rated it above
        `neutralScore`. With `n` such items the result is `1.0 / n`; with
        none it is 2.0. When `items` is given explicitly, an item a judge has
        not rated is treated as rated `neutralScore` (i.e. not preferred)
        instead of raising KeyError.
        """
        first = self.judgeRatings[j1]
        second = self.judgeRatings[j2]
        if items is None:
            items = set(first) & set(second)
        similarityCount = sum(1 for item in items
                              if first.get(item, neutralScore) > neutralScore
                              and second.get(item, neutralScore) > neutralScore)
        if similarityCount:
            return 1.0 / similarityCount
        return 2.0
0148
class SMSimilarityModel(Model):
    """Uses the constrained Pearson correlation function (Shardanand & Maes 1995)"""

    def similarity(self, j1, j2, neutralScore=0, items=None):
        """Return `2.0 - r`, where `r` is the constrained Pearson coefficient.

        See `Model.similarity` for argument information.

        `r` is the sum of products of both judges' deviations from
        `neutralScore`, divided by the square root of the product of their
        summed squared deviations. Subtracting it from 2.0 makes smaller
        results mean closer agreement.

        When `items` is not given, only items rated by both judges are used.
        When it is given, an item a judge has not rated is treated as rated
        `neutralScore`, so it contributes nothing. If either judge has no
        deviation from `neutralScore` over the compared items (including the
        case of no items at all), 1.0 is returned.
        """
        first = self.judgeRatings[j1]
        second = self.judgeRatings[j2]
        if items is None:
            items = set(first) & set(second)
        top = bottomleft = bottomright = 0
        for item in items:
            left = first.get(item, neutralScore) - neutralScore
            right = second.get(item, neutralScore) - neutralScore
            top += left * right
            # The denominator must use the same deviations from the neutral
            # score as the numerator; raw ratings skew the coefficient
            # whenever neutralScore is not 0.
            bottomleft += left * left
            bottomright += right * right
        if not bottomleft or not bottomright:
            return 1.0
        similarity = float(top) / math.sqrt(bottomleft * bottomright)
        return 2.0 - similarity