Python 實現協同過濾算法
實現基於 MapReduce 的協同過濾,需要三個階段,如下所示:
第一個 MapReduce:通過 ui(user-item)矩陣得到歸一化後的 ui 矩陣
map 階段:以 i 為 key 進行分區排序,key 的 hash 編碼相同的記錄會被放到同一個 partition 中。
#!/usr/local/bin/python
"""Stage 1 mapper: re-key ``user,item,score`` records by item.

Reads comma-separated ``user,item,score`` lines from stdin and writes
tab-separated ``item, user, score`` lines to stdout, so that the item id
becomes the leading field of every emitted record.
"""
import sys


def map_by_item(lines):
    """Yield one tab-separated ``item, user, score`` string per input record.

    Args:
        lines: iterable of ``user,item,score`` text lines.

    Yields:
        The same three fields with the item moved to the front, joined by
        tab characters (no trailing newline).

    Raises:
        ValueError: if a non-blank line does not contain exactly three
            comma-separated fields.
    """
    for line in lines:
        record = line.strip()
        if not record:
            # A blank line (e.g. a trailing newline) carries no rating and
            # would otherwise break the three-way unpack below.
            continue
        user, item, score = record.split(',')
        yield "%s\t%s\t%s" % (item, user, score)


if __name__ == "__main__":
    for output in map_by_item(sys.stdin):
        # A single-argument print behaves the same on Python 2 and Python 3.
        print(output)
reduce 階段:利用所有用戶對同一個 i 打過的 score,對其進行歸一化操作
#!/usr/local/bin/python
"""Stage 1 reducer: L2-normalise the scores of each item.

Reads tab-separated ``item, user, score`` lines from stdin, where all lines
of one item are expected to be adjacent, and writes tab-separated
``user, item, normalised_score`` lines to stdout.
"""
import sys
import math
from itertools import groupby


def _records(lines):
    """Yield the tab-split fields of every non-blank line."""
    for line in lines:
        record = line.strip()
        if record:
            yield record.split('\t')


def normalize_item_scores(lines):
    """Divide every score of an item by the Euclidean norm of that item.

    Consecutive records that share the same item id form one group; the
    norm is the square root of the sum of squared scores in that group.

    Args:
        lines: iterable of tab-separated ``item, user, score`` text lines,
            grouped (e.g. sorted) by item.

    Yields:
        Tab-separated ``user, item, score / norm`` strings, in input order.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields
            or its score is not a number.
    """
    for item, group in groupby(_records(lines), key=lambda fields: fields[0]):
        user_scores = [(user, float(score)) for _, user, score in group]
        # The norm is computed from scratch for every group, so no state
        # leaks from one item into the next (or into the final item).
        norm = math.sqrt(sum(score * score for _, score in user_scores))
        if norm == 0.0:
            # Every score of this item is zero: the normalised vector is
            # undefined, so the item is skipped instead of dividing by zero.
            continue
        for user, score in user_scores:
            yield "%s\t%s\t%s" % (user, item, score / norm)


if __name__ == "__main__":
    for output in normalize_item_scores(sys.stdin):
        # A single-argument print behaves the same on Python 2 and Python 3.
        print(output)
第二個MapReduce:
map 階段:為了得到 ii 矩陣,必須以 u 為 key,輸出 (u, i, s)
#!/usr/local/bin/python
"""Stage 2 mapper: pass ``user, item, score`` records through keyed by user.

Reads tab-separated ``user, item, score`` lines from stdin and re-emits the
same three fields, tab-separated, on stdout with the user as leading field.
"""
import sys


def map_by_user(lines):
    """Yield one tab-separated ``user, item, score`` string per input record.

    Args:
        lines: iterable of tab-separated ``user, item, score`` text lines.

    Yields:
        The three fields re-joined by tab characters (no trailing newline).

    Raises:
        ValueError: if a non-blank line does not contain exactly three
            tab-separated fields.
    """
    for line in lines:
        record = line.strip()
        if not record:
            # A blank line carries no record and would otherwise break the
            # three-way unpack below.
            continue
        user, item, score = record.split('\t')
        yield "%s\t%s\t%s" % (user, item, score)


if __name__ == "__main__":
    for output in map_by_user(sys.stdin):
        # A single-argument print behaves the same on Python 2 and Python 3.
        print(output)
reduce 階段:對同一個用戶,計算其所有打過分的 item 兩兩之間歸一化後分數的乘積,得到 ii 矩陣
#!/usr/local/bin/python
"""Stage 2 reducer: emit score products for every item pair of a user.

Reads tab-separated ``user, item, score`` lines from stdin, where all lines
of one user are expected to be adjacent, and writes tab-separated
``item_a, item_b, score_a * score_b`` lines to stdout, once per direction.
"""
import sys
from itertools import combinations, groupby


def _records(lines):
    """Yield the tab-split fields of every non-blank line."""
    for line in lines:
        record = line.strip()
        if record:
            yield record.split('\t')


def item_pair_products(lines):
    """Yield the score product of each unordered item pair, in both orders.

    Consecutive records that share the same user id form one group. Within
    a group every pair of records (first before second, in input order) is
    emitted twice: as ``item_a, item_b`` and as ``item_b, item_a``.

    Args:
        lines: iterable of tab-separated ``user, item, score`` text lines,
            grouped (e.g. sorted) by user.

    Yields:
        Tab-separated ``item, other_item, product`` strings. A user with
        fewer than two records yields nothing.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields
            or its score is not a number.
    """
    for _, group in groupby(_records(lines), key=lambda fields: fields[0]):
        item_scores = [(item, float(score)) for _, item, score in group]
        # combinations() walks index pairs (i, j) with i < j, the same order
        # as two nested index loops.
        for (item_a, score_a), (item_b, score_b) in combinations(item_scores, 2):
            product = score_a * score_b
            yield "%s\t%s\t%s" % (item_a, item_b, product)
            yield "%s\t%s\t%s" % (item_b, item_a, product)


if __name__ == "__main__":
    for output in item_pair_products(sys.stdin):
        # A single-argument print behaves the same on Python 2 and Python 3.
        print(output)
第三個MapReduce:
map 階段:以 item_a_item_b 為 key,調用 map 函數
#!/usr/local/bin/python
"""Stage 3 mapper: collapse an item pair into a single composite key.

Reads tab-separated ``item_a, item_b, score`` lines from stdin and writes
tab-separated ``item_a_item_b, score`` lines to stdout.
"""
import sys


def map_pair_key(lines):
    """Yield ``<item_a>_<item_b>`` and the score, tab-separated, per record.

    Args:
        lines: iterable of tab-separated ``item_a, item_b, score`` lines.

    Yields:
        Two-field strings: the underscore-joined pair key and the score.

    Raises:
        ValueError: if a non-blank line does not contain exactly three
            tab-separated fields.
    """
    for line in lines:
        record = line.strip()
        if not record:
            # A blank line carries no record and would otherwise break the
            # three-way unpack below.
            continue
        item_a, item_b, score = record.split('\t')
        # NOTE(review): the key is only unambiguous if item ids contain no
        # underscore -- confirm against the data set.
        yield "%s\t%s" % (item_a + "_" + item_b, score)


if __name__ == "__main__":
    for output in map_pair_key(sys.stdin):
        # A single-argument print behaves the same on Python 2 and Python 3.
        print(output)
reduce階段: 對相同的key進行聚合,對value值score進行求和,就得到item與item之間的相似度
#!/usr/local/bin/python
"""Stage 3 reducer: sum the scores of each item pair.

Reads tab-separated ``item_a_item_b, score`` lines from stdin, where all
lines of one pair key are expected to be adjacent, and writes tab-separated
``item_a, item_b, total`` lines to stdout.
"""
import sys
from itertools import groupby


def _records(lines):
    """Yield the tab-split fields of every non-blank line."""
    for line in lines:
        record = line.strip()
        if record:
            yield record.split('\t')


def sum_pair_scores(lines):
    """Yield the summed score of every run of identical pair keys.

    Consecutive records that share the same ``item_a_item_b`` key form one
    group; the emitted total is the sum of that group's scores.

    Args:
        lines: iterable of tab-separated ``pair_key, score`` text lines,
            grouped (e.g. sorted) by pair key.

    Yields:
        Tab-separated ``item_a, item_b, total`` strings, one per group.
        Empty input yields nothing.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields,
            a score is not a number, or a key does not split into exactly
            two parts on ``_``.
    """
    for pair_key, group in groupby(_records(lines), key=lambda fields: fields[0]):
        # Emit the accumulated total itself, starting from 0.0 per group.
        total = sum((float(score) for _, score in group), 0.0)
        item_a, item_b = pair_key.split('_')
        yield "%s\t%s\t%s" % (item_a, item_b, total)


if __name__ == "__main__":
    for output in sum_pair_scores(sys.stdin):
        # A single-argument print behaves the same on Python 2 and Python 3.
        print(output)