itemCF 基於物品的協同過濾
代碼來自 CSDN 上分享的開源代碼。
我加入了一些輸出,用來得到矩陣的內容。
# -*- coding=utf-8 -*-
import math
import sys
import pandas
import numpy as np
from texttable import Texttable
from collections import defaultdict
# from Wtemp import *
from operator import itemgetter
def readFile(fileData):
    """Load tab-separated rating triples from a text file.

    Every non-blank line must start with three integer fields separated by
    tabs; any further fields on the line are ignored. Blank lines (for
    example a trailing empty line) are skipped instead of crashing ``int``.

    Args:
        fileData: Path of the file to read.

    Returns:
        A list of ``[int, int, int]`` lists, one per data line, in file
        order (the first three fields of each line).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If one of the first three fields is not an integer.
        IndexError: If a non-blank line has fewer than three fields.
    """
    rates = []
    # The context manager closes the file even if a line fails to parse.
    with open(fileData, "r") as f:
        for line in f:
            if not line.strip():
                continue
            fields = line.split("\t")
            # int() tolerates the trailing newline on the last field.
            rates.append([int(fields[0]), int(fields[1]), int(fields[2])])
    return rates
def createDict(rates):
    """Group rating triples by user and by item, and dump both maps to CSV.

    Args:
        rates: Iterable of indexable rows whose first three elements are
            ``user_id``, ``item_id`` and ``rating``.

    Returns:
        A ``(user_dict, movie_dict)`` tuple of plain dicts.
        ``user_dict[user_id]`` is the list of ``(item_id, rating)`` tuples
        of that user and ``movie_dict[item_id]`` is the list of user ids
        that rated the item, both in input order.

    Side effects:
        Writes ``user_txt.csv`` and ``movie_txt.csv`` to the current working
        directory.
    """
    user_dict = {}
    movie_dict = {}
    for row in rates:
        user_id, item_id, rating = row[0], row[1], row[2]
        # setdefault keeps the results plain dicts (no implicit key creation
        # for callers) while removing the manual "key present?" branches.
        user_dict.setdefault(user_id, []).append((item_id, rating))
        movie_dict.setdefault(item_id, []).append(user_id)

    # Dump both mappings so their content can be inspected afterwards.
    user_txt = pandas.Series(user_dict)
    user_txt.to_csv('user_txt.csv')
    item_txt = pandas.Series(movie_dict)
    item_txt.to_csv('movie_txt.csv')
    return user_dict, movie_dict
def itemCF(user_dict, out=None):
    """Compute item-to-item similarity from co-rating counts.

    For every ordered pair of distinct items ``(i, j)`` that appear together
    in at least one user's rating list, the similarity is
    ``W[i][j] = C[i][j] / sqrt(N[i] * N[j])`` where ``N[i]`` is the number of
    ratings recorded for item ``i`` and ``C[i][j]`` is the number of times
    ``i`` and ``j`` occur in the same user's list.

    Args:
        user_dict: Mapping ``user_id -> [(item_id, rating), ...]``.
        out: Optional writable text stream. When given, one line holding the
            list ``[i, j, W[i][j]]`` is written per similarity entry. The
            default ``None`` writes nothing.

    Returns:
        A ``defaultdict`` mapping ``item_id -> {other_item_id: similarity}``.
        Looking up an item without any co-rated item yields an empty dict.
    """
    N = defaultdict(int)                       # item id -> number of ratings
    C = defaultdict(lambda: defaultdict(int))  # item id -> {item id: co-count}
    for ratings in user_dict.values():
        for entry_i in ratings:
            item_i = entry_i[0]
            N[item_i] += 1
            for entry_j in ratings:
                item_j = entry_j[0]
                # Compare item ids, not (id, rating) tuples, so an item is
                # never paired with itself and counts accumulate per id.
                if item_i == item_j:
                    continue
                C[item_i][item_j] += 1

    W = defaultdict(dict)
    for i, related_item in C.items():
        for j, cij in related_item.items():
            W[i][j] = cij / math.sqrt(N[i] * N[j])
            if out is not None:
                out.write(str([i, j, W[i][j]]) + "\n")
    return W
def recommondation(user_id, user_dict, K, W=None, top_n=10):
    """Rank items the user has not rated yet by predicted interest.

    For each item the user rated, the ``K`` most similar items are looked
    up; every one of them that the user has not rated receives
    ``rating * similarity``. Contributions are summed per candidate item and
    the best ``top_n`` candidates are returned.

    Args:
        user_id: Key of the target user in ``user_dict``.
        user_dict: Mapping ``user_id -> [(item_id, rating), ...]``.
        K: Number of nearest neighbour items considered per rated item.
        W: Optional precomputed similarity mapping
            ``item_id -> {other_item_id: similarity}``. When ``None`` it is
            built from ``user_dict`` with ``itemCF``; pass it explicitly to
            avoid recomputing it for every user.
        top_n: Maximum number of recommendations returned (default 10).

    Returns:
        A list of ``(item_id, score)`` tuples sorted by descending score.

    Raises:
        KeyError: If ``user_id`` is not present in ``user_dict``.
    """
    if W is None:
        W = itemCF(user_dict)

    ratings = user_dict[user_id]
    # Rated item ids as a set: the candidate is an id, while the rating list
    # holds (id, rating) tuples, so a direct "in" test on the list never hits.
    rated = {entry[0] for entry in ratings}

    rank = defaultdict(int)
    for item, score in ratings:
        # .get avoids inserting empty entries into a defaultdict W and also
        # works when W is a plain dict.
        neighbours = sorted(W.get(item, {}).items(),
                            key=itemgetter(1), reverse=True)[:K]
        for candidate, similarity in neighbours:
            if candidate in rated:
                continue
            rank[candidate] += score * similarity

    return sorted(rank.items(), key=itemgetter(1), reverse=True)[:top_n]
def getMovieList(item, encoding=None):
    """Load the item catalogue from a tab-separated file and dump it to CSV.

    Every non-blank line must start with an integer item id; the remaining
    tab-separated fields are stored as that item's description. Blank lines
    are skipped and the line terminator is stripped, so the last field no
    longer carries a trailing newline.

    Args:
        item: Path of the catalogue file.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the platform default encoding.

    Returns:
        A dict mapping ``item_id`` (int) to the list of remaining fields
        (strings) of that line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first field of a non-blank line is not an integer.

    Side effects:
        Writes ``item_txt.csv`` to the current working directory.
    """
    items = {}
    with open(item, "r", encoding=encoding) as f:
        for line in f:
            if not line.strip():
                continue
            fields = line.rstrip("\r\n").split("\t")
            items[int(fields[0])] = fields[1:]

    # Dump the catalogue so its content can be inspected afterwards.
    item_txt = pandas.Series(items)
    item_txt.to_csv('item_txt.csv')
    return items
# Script entry point: load the data, recommend items for one user and print
# the result as a text table.
if __name__ == '__main__':
    itemTemp = getMovieList("D:/PycharmProjects/reMov/artists.item")  # item id -> description fields
    fileTemp = readFile("D:/PycharmProjects/reMov/user_artists.data")  # rating triples
    user_dic, movie_dic = createDict(fileTemp)
    print("創建字典")

    user_id = 66
    # Top recommendations for the user, using 80 neighbours per rated item.
    movieTemp = recommondation(user_id, user_dic, 80)
    movieTemp_txt = pandas.Series(movieTemp)
    movieTemp_txt.to_csv('movieTemp_txt.csv')
    print("創建字典2")

    table = Texttable()
    table.set_deco(Texttable.HEADER)
    # The second column holds item names, so it is declared as text; with
    # the float type a purely numeric name would be rendered as a number.
    table.set_cols_dtype(['t', 't', 'a'])
    table.set_cols_align(["l", "l", "l"])

    rows = [["user name", "recommondation_movie", "from userid"]]
    for i in movieTemp:
        fields = itemTemp.get(i[0])
        # Fall back to the raw id when the catalogue has no name for it.
        name = fields[0] if fields else str(i[0])
        rows.append([user_id, name, ""])
    table.add_rows(rows)
    print(table.draw())