【參考文獻】:Sarwar B M . Item-based collaborative filtering recommendation algorithms[C]// International Conference on World Wide Web. ACM, 2001.
背景:推薦領域必讀文獻之一,經典之作,本博客主要記錄了該文章的主要思想和相關實現代碼,歡迎觀摩!
前提或假設
- 用戶對項目的評分值,能夠反映用戶對項目某種程度上的偏好。
- 用戶過去的偏好很可能展示或者反映未來的興趣偏好。
數據集
我們選用MovieLens 100K Dataset,=> 100,000 ratings from 1000 users on 1700 movies.
下載地址:movielens數據集
算法理論
算法框架:如圖,輸入是user-item的評分矩陣,該矩陣非常稀疏。算法的任務是預測特定用戶對特定項目的評分,填補矩陣中空白單元格,接着根據預測評分從高到低爲特定用戶進行top-N推薦
算法預測:算法認爲某用戶喜歡某項目,在很大程度上也會對和該項目較相似的項目產生興趣。所以預測分兩步進行:計算項目之間的相似性和根據相似性進行預測評分。
文章提供了三個相似性計算公式:
Cosine-based Similarity
$$sim(i,j)= \cos(\vec{i},\vec{j})= \frac{\vec{i}\cdot \vec{j}}{\left \| \vec{i} \right \|_{2}*\left \| \vec{j} \right \|_{2}}$$
Correlation-based Similarity
$$sim(i,j)= \frac{\sum _{u\in U}(R_{u,i}-\bar{R}_{i})(R_{u,j}-\bar{R}_{j})}{\sqrt{\sum _{u\in U}(R_{u,i}-\bar{R}_{i})^{2}}\sqrt{\sum _{u\in U}(R_{u,j}-\bar{R}_{j})^{2}}}$$
Adjusted Cosine Similarity
$$sim(i,j)= \frac{\sum _{u\in U}(R_{u,i}-\bar{R}_{u})(R_{u,j}-\bar{R}_{u})}{\sqrt{\sum _{u\in U}(R_{u,i}-\bar{R}_{u})^{2}}\sqrt{\sum _{u\in U}(R_{u,j}-\bar{R}_{u})^{2}}}$$
但是所有的相似性計算公式必須在共同評分項上進行,即只使用同時評價過 i 和 j 的用戶(上式中的集合 U)的歷史評分。
算法選取和該項目最相似的前N個項目作爲預測基礎,預測公式如下:
$$P_{u,i}=\frac{\sum _{\text{all similar items},\,N}(S_{i,N}*R_{u,N})}{\sum _{\text{all similar items},\,N}(\left | S_{i,N} \right |)}$$
算法最後一步,根據預測評分值從高到低進行推薦
實驗度量
文章採用MAE進行誤差度量,公式如下:
$$MAE = \frac{\sum_{i=1}^{N}\left | p_{i}-q_{i} \right |}{N}$$
Python 代碼
#!/usr/bin/python
# -*- coding=utf-8 -*-
import math
import operator
def loadData(train_file='./dataset/u1.base'):
    """Load the training ratings and index them by user and by item.

    Every non-blank line of *train_file* must be tab-separated with the user
    id, item id and rating in its first three columns; further columns are
    ignored.  Ids are kept as strings and ratings are converted to float.
    When a (user, item) pair occurs more than once, the first rating read
    is the one that is kept.

    The test split is deliberately not read here: it was never part of the
    returned value, so parsing it only cost time and leaked a file handle.

    Args:
        train_file: path of the training split.

    Returns:
        A two-element list ``[trainSet, movieUser]`` where
        ``trainSet`` is ``{userid: {itemid: rating}}`` and
        ``movieUser`` is ``{itemid: {userid: rating}}`` (every user that
        rated a given item).

    Raises:
        IOError/OSError: if *train_file* cannot be opened.
        IndexError: if a non-blank line has fewer than three columns.
        ValueError: if a rating is not a number.
    """
    trainSet = {}
    movieUser = {}
    # The context manager closes the file even if a line fails to parse.
    with open(train_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            arr = line.split('\t')
            userId = arr[0]
            itemId = arr[1]
            rating = float(arr[2])
            # setdefault keeps the first rating seen for a duplicate pair.
            trainSet.setdefault(userId, {}).setdefault(itemId, rating)
            movieUser.setdefault(itemId, {}).setdefault(userId, rating)
    return [trainSet, movieUser]
def i_j_users(i_id, j_id, movieUser):
    """Return the users that rated both item *i_id* and item *j_id*.

    Args:
        i_id: id of the first item.
        j_id: id of the second item.
        movieUser: ``{itemid: {userid: rating}}`` index of raters per item.

    Returns:
        A one-entry dict ``{(i_id, j_id): {userid: None, ...}}`` whose inner
        dict has one key per common rater (values are always ``None``).
        An item missing from *movieUser* is treated as having no raters, so
        the inner dict is then empty.
    """
    i_users = movieUser.get(i_id, {})
    j_users = movieUser.get(j_id, {})
    # Iterate the raters of i and keep those that also rated j.
    common = dict.fromkeys(user for user in i_users if user in j_users)
    return {(i_id, j_id): common}
def getAverageRating(trainSet, userid):
    """Return the arithmetic mean of every rating stored for *userid*.

    Args:
        trainSet: ``{userid: {itemid: rating}}``.
        userid: key of the user inside *trainSet*.

    Returns:
        The mean rating as a float.

    Raises:
        KeyError: if *userid* is not in *trainSet*.
        ZeroDivisionError: if the user has no ratings.
    """
    ratings = trainSet[userid]
    total = float(sum(ratings.values()))
    return total / len(ratings)
def getItemSim(i_j_users, i_id, j_id, trainSet):
    """Adjusted-cosine similarity of two items over their common raters.

    For every user that rated both items, each rating is centred on that
    user's mean rating; the similarity is the cosine of the two centred
    rating vectors.

    Args:
        i_j_users: ``{(i_id, j_id): {userid: ...}}`` holding the users that
            rated both items.  (This parameter shadows the module-level
            function of the same name inside this function.)
        i_id: id of the first item.
        j_id: id of the second item.
        trainSet: ``{userid: {itemid: rating}}``.

    Returns:
        * ``0.0`` when no user rated both items.  A similarity must lie in
          [-1, 1]; an out-of-range sentinel would act as a huge negative
          weight in any weighted average built from the result.
        * ``1.0`` when either centred vector is all zeros (see note below).
        * otherwise the adjusted-cosine value in [-1, 1].

    Raises:
        KeyError: if ``(i_id, j_id)`` is not a key of *i_j_users*, or a
            common rater is missing the expected entries in *trainSet*.
    """
    co_raters = i_j_users[(i_id, j_id)]
    if not co_raters:
        return 0.0

    sumtop = 0.0   # numerator: sum of products of centred ratings
    sumbot1 = 0.0  # squared norm of item i's centred ratings
    sumbot2 = 0.0  # squared norm of item j's centred ratings
    for user in co_raters:
        avr_user = getAverageRating(trainSet, user)
        left = trainSet[user][i_id] - avr_user
        right = trainSet[user][j_id] - avr_user
        sumtop += left * right
        sumbot1 += left * left
        sumbot2 += right * right

    if sumbot1 == 0 or sumbot2 == 0:
        # NOTE(review): the cosine is undefined here; scoring it as a
        # perfect match is the pre-existing choice and is kept as-is --
        # confirm that this is really intended.
        return 1.0
    return sumtop / (math.sqrt(sumbot1) * math.sqrt(sumbot2))
def i_allitem_sort(i_id, movieUser, trainSet, N):
    """Return the (at most) N items most similar to *i_id*.

    Every other item in *movieUser* that shares at least one rater with
    *i_id* is scored with getItemSim(); the candidates are ordered by
    descending similarity and the first N are kept.  Items with no common
    rater are skipped: there is no evidence to score them on, and they must
    not pad the neighbourhood when fewer than N real candidates exist.

    Args:
        i_id: id of the target item.
        movieUser: ``{itemid: {userid: rating}}``.
        trainSet: ``{userid: {itemid: rating}}``.
        N: maximum number of neighbours to return.

    Returns:
        ``{j_id: similarity}`` for the selected neighbours, inserted in
        descending order of similarity.
    """
    i_allitem = {}
    for j in movieUser:
        if j == i_id:
            continue
        i_j_user = i_j_users(i_id, j, movieUser)
        if not i_j_user[(i_id, j)]:
            # No user rated both items: not a usable neighbour.
            continue
        i_allitem[j] = getItemSim(i_j_user, i_id, j, trainSet)

    ranked = sorted(i_allitem.items(), key=operator.itemgetter(1), reverse=True)
    return dict(ranked[:N])
def prediction(userid, itemid, moviUser, trainSet, N):
    """Predict the rating *userid* would give *itemid*.

    The prediction is the similarity-weighted average of the user's own
    ratings over the N items most similar to *itemid*::

        sum(sim(i, j) * r(u, j)) / sum(|sim(i, j)|)

    taken over the neighbours j that the user has actually rated.

    Args:
        userid: id of the user.
        itemid: id of the item to predict.
        moviUser: ``{itemid: {userid: rating}}``.  The spelling of this
            parameter name is kept so keyword callers continue to work; the
            body now reads this argument rather than a same-purpose
            module-level variable.
        trainSet: ``{userid: {itemid: rating}}``.
        N: neighbourhood size passed to i_allitem_sort().

    Returns:
        The predicted rating as a float, or ``0`` when no prediction can be
        made (the user has no training ratings, or rated none of the
        neighbours, or the absolute similarities sum to zero).
    """
    user_ratings = trainSet.get(userid)
    if not user_ratings:
        # Unknown user or no ratings: nothing to average, so skip the
        # expensive neighbour search altogether.
        return 0

    nsets = i_allitem_sort(itemid, moviUser, trainSet, N)
    sumtop = 0.0
    sumbot = 0.0
    for j, sim in nsets.items():
        if j not in user_ratings:
            # Neighbours the user never rated carry no information.
            continue
        sumtop += sim * user_ratings[j]
        sumbot += abs(sim)

    # Guard against a zero denominator.
    if sumbot == 0:
        return 0
    return sumtop / sumbot
def saveFile(moviUser, trainSet, N,
             test_file="../Collaborative Filtering/dataset/u1.test",
             out_file="../Collaborative Filtering/predict"):
    """Predict every rating of the test split and write the results.

    Each non-blank line of *test_file* must be tab-separated with the user
    id, item id and true rating in its first three columns.  For each one a
    line ``user<TAB>item<TAB>rating<TAB>prediction`` is written to
    *out_file*, exactly once and in input order.

    Args:
        moviUser: ``{itemid: {userid: rating}}`` (name spelling kept for
            keyword callers).
        trainSet: ``{userid: {itemid: rating}}``.
        N: neighbourhood size forwarded to prediction().
        test_file: path of the test split to read.
        out_file: path of the predictions file to create or overwrite.

    Returns:
        None.
    """
    # Both handles are closed on exit, including when a prediction raises.
    with open(test_file) as f, open(out_file, 'w') as fw:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            arr = line.split('\t')
            uid = arr[0].strip()
            item = arr[1].strip()
            rating = float(arr[2].strip())
            predictScore = prediction(uid, item, moviUser, trainSet, N)
            # Stream one row at a time instead of growing one big string.
            fw.write("\t".join([uid, item, str(rating), str(predictScore)]) + "\n")
def getMAE(predict_file="../Collaborative Filtering/predict"):
    """Compute, print and return the mean absolute error of the predictions.

    Each non-blank line of *predict_file* must be tab-separated with the
    true rating in the third column and the predicted rating in the fourth.
    A stored prediction of 0 is treated as "no prediction available"; such
    rows are left out of both the error sum and the row count so they
    cannot pull the average towards zero.

    Args:
        predict_file: path of the predictions file to score.

    Returns:
        The MAE as a float, or ``None`` when the file holds no scorable row
        (instead of failing with a division by zero).  The same value is
        printed.
    """
    total_error = 0.0
    counttest = 0  # number of rows that carry a usable prediction
    with open(predict_file) as f:
        for line in f:
            if not line.strip():
                continue
            arr = line.split('\t')
            rating = float(arr[2].strip())
            predictScore = float(arr[3].strip())
            if predictScore == 0:
                continue
            total_error += abs(predictScore - rating)
            counttest += 1

    mae = total_error / counttest if counttest else None
    print(mae)
    return mae
if __name__ == '__main__':
    # Script entry point: load the training data, then write one prediction
    # per test rating using the N most similar items as the neighbourhood.
    N = 30
    arr = loadData()
    trainSet, movieUser = arr[0], arr[1]
    saveFile(movieUser, trainSet, N)
    # Scoring is left switched off; call getMAE() here to evaluate the
    # predictions file once it has been written.