環境
python版本:3.5
數據來源
數據來自51CTO網站的分享,點此下載
關聯規則
所謂關聯規則,就是指現實中同時發生的兩種不同事情之間的相關聯程度,具體分析可以參考這篇博客,講得很清晰
數據分析
這是數據文件
數據文件
其中movies中電影信息的內容如圖所示
movies.dat
每行分別為電影id,電影名字,電影類型,每項之間用::分隔,ratings.dat為收集的用戶打分記錄,users.dat為用戶id對應的用戶信息,personalRatings.txt為個人打分,用來找到規律後為個人推薦電影,ratings.dat文件內容如圖所示
ratings.dat
其中分別為用戶id,電影id,評分(1-5分),評分時間,總共一百萬行多點的數據。我設置評分高於3分(即4分或5分)算是喜歡,最小支持度為0.2,最小置信度為0.5,下面是代碼實現
# -*- coding: utf-8 -*-
"""
Apriori exercise.
Created on Sun Oct 26 11:09:03 2017
@author: FWW
"""
import time
def createC1( dataSet ):
    '''
    Build C1, the list of all candidate itemsets of size 1.

    dataSet: iterable of transactions, each an iterable of hashable items
        that can be ordered against each other.
    Returns a list with one single-element frozenset per distinct item,
    ordered by item value.
    '''
    # Collect the distinct items in a set: the membership test is O(1),
    # whereas scanning a growing list for every item is quadratic in the
    # number of distinct items.
    uniqueItems = set()
    for transaction in dataSet:
        uniqueItems.update( transaction )
    # frozenset (not set) so the itemsets can later be used as dict keys.
    return [ frozenset( [ item ] ) for item in sorted( uniqueItems ) ]
def scanD( D, Ck, minSupport ):
    '''
    Compute the support of every candidate itemset over the transactions.

    D: list of transactions, each a set of items.
    Ck: re-iterable collection of candidate itemsets (frozensets).
    minSupport: minimum fraction of transactions that must contain an
        itemset for it to be kept.
    Returns (retList, supportData):
        retList     - the candidates whose support is >= minSupport, in
                      reverse order of first occurrence in D.
        supportData - dict mapping every candidate that occurs in at least
                      one transaction to its support. Candidates that never
                      occur get no entry.
    '''
    ssCnt = {}
    for tid in D:
        for can in Ck:
            # A candidate is supported by a transaction that contains
            # all of its items.
            if can.issubset( tid ):
                ssCnt[ can ] = ssCnt.get( can, 0 ) + 1
    numItems = float( len( D ) )
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        supportData[ key ] = support
        if support >= minSupport:
            retList.append( key )
    # Appending and reversing once yields the same order as inserting each
    # hit at the front, without shifting the whole list on every insert.
    retList.reverse()
    return retList, supportData
# Apriori algorithm
def aprioriGen( Lk, k ):
    '''
    Join itemsets of size k - 1 into candidate itemsets of size k.

    Lk: list of frozensets, each expected to hold k - 1 orderable items.
    k: number of items each generated candidate should contain.
    Returns a list with the union of every pair of itemsets in Lk whose
    first k - 2 items (in sorted order) are identical.
    '''
    # Sort BEFORE slicing. A frozenset has no defined iteration order, so
    # slicing first compares arbitrary elements and can miss or duplicate
    # candidates. Computing each prefix once also keeps the sort out of the
    # pairwise loop.
    prefixes = [ sorted( itemset )[ : k - 2 ] for itemset in Lk ]
    retList = []
    lenLk = len( Lk )
    for i in range( lenLk ):
        for j in range( i + 1, lenLk ):
            if prefixes[ i ] == prefixes[ j ]:
                retList.append( Lk[ i ] | Lk[ j ] )
    return retList
def apriori( dataSet, minSupport = 0.5 ):
    '''
    Find the frequent itemsets of dataSet level by level.

    dataSet: re-iterable collection of transactions (iterables of items).
    minSupport: minimum support passed to scanD at every level.
    Returns (L, suppData):
        L        - list of lists; L[0] is what scanD keeps from the size-1
                   candidates and each further entry is what it keeps from
                   the candidates aprioriGen builds from the previous entry.
                   The last entry is always an empty list.
        suppData - the support dicts returned by scanD for all levels,
                   merged into one dict.
    '''
    # Build the size-1 candidates first, then turn every transaction into
    # a set, which is the form scanD expects.
    candidates = createC1( dataSet )
    transactions = [ set( row ) for row in dataSet ]
    frequent, suppData = scanD( transactions, candidates, minSupport )
    L = [ frequent ]
    # The itemsets in L[0] have one item, so the next level has size 2.
    size = 2
    # Stop once a level produced no frequent itemsets.
    while L[ -1 ]:
        candidates = aprioriGen( L[ -1 ], size )
        frequent, levelSupport = scanD( transactions, candidates, minSupport )
        suppData.update( levelSupport )
        L.append( frequent )
        size += 1
    return L, suppData
def calcConf( freqSet, H, supportData, brl, minConf=0.5 ):
    '''
    Evaluate the rules (freqSet - conseq) --> conseq for every conseq in H.

    freqSet (frozenset): the itemset the rules are derived from.
    H (iterable of frozenset): candidate right-hand sides.
    supportData (dict): support per itemset; must contain freqSet and
        every freqSet - conseq, otherwise a KeyError is raised.
    brl (list): output list; a tuple (antecedent, consequent, confidence)
        is appended for every rule that reaches minConf.
    minConf (float): minimum confidence.
    Returns the list of consequents whose rule reached minConf.
    '''
    kept = []
    for consequent in H:
        antecedent = freqSet - consequent
        # confidence = support(whole itemset) / support(left-hand side)
        confidence = supportData[ freqSet ] / supportData[ antecedent ]
        if confidence >= minConf:
            brl.append( ( antecedent, consequent, confidence ) )
            kept.append( consequent )
    return kept
def rulesFromConseq( freqSet, H, supportData, brl, minConf=0.5 ):
    '''
    Generate rules from freqSet with progressively larger right-hand sides.

    freqSet (frozenset): the itemset the rules are derived from.
    H (non-empty list of frozenset): candidate right-hand sides, all
        expected to have the same size.
    supportData (dict): support per itemset, as used by calcConf.
    brl (list): output list that calcConf appends the rules to.
    minConf (float): minimum confidence.
    Returns None; the rules are delivered through brl.
    '''
    size = len( H[ 0 ] )
    if size == 1:
        # Record the rules with a single-item right-hand side. The pruned
        # list returned by calcConf is not used, so H stays complete.
        calcConf( freqSet, H, supportData, brl, minConf )
    # A right-hand side of size + 1 items must leave at least one item on
    # the left-hand side.
    if len( freqSet ) <= size + 1:
        return
    merged = aprioriGen( H, size + 1 )
    surviving = calcConf( freqSet, merged, supportData, brl, minConf )
    # At least two surviving consequents are needed to merge them further.
    if len( surviving ) > 1:
        rulesFromConseq( freqSet, surviving, supportData, brl, minConf )
def recommendMovies(rules,personal_list,movie_list):
    '''
    Print one recommendation per movie title implied by the matching rules.

    rules: iterable of rules indexable as (antecedent, consequent,
        confidence), antecedent and consequent being sets of movie ids.
    personal_list: set or frozenset of the movie ids the user likes.
    movie_list: sequence of hashable titles in which the title of movie
        id N is stored at index N - 1.
    A rule matches when its antecedent is a subset of personal_list. Each
    title is printed once, with the confidence of the first matching rule
    that produced it. Returns None.
    '''
    recommend_list = []
    sup_list = []
    # Set of titles already recommended: O(1) duplicate check instead of
    # scanning recommend_list for every consequent.
    seen = set()
    for rule in rules:
        if not rule[0] <= personal_list:
            continue
        for movie in rule[1]:
            title = movie_list[movie-1]
            if title not in seen:
                seen.add(title)
                recommend_list.append(title)
                sup_list.append(rule[2])
    # Both lists grow in lockstep, so zip pairs every title with its
    # confidence without a list.index() search per printed line.
    for recommend, confidence in zip(recommend_list, sup_list):
        print('Recommend you to watch',recommend,',',round(confidence*100,2),'% people who is similar to you like it!')
def generateRules( L, supportData, minConf=0.5 ):
    '''
    Derive association rules from the frequent itemsets.

    L (list): lists of frequent itemsets as returned by apriori; L[0] is
        skipped, so only L[1] and later entries produce rules.
    supportData (dict): support per itemset, as used by calcConf.
    minConf (float): minimum confidence a rule must reach.
    Returns a list of (antecedent, consequent, confidence) tuples.
    '''
    bigRuleList = []
    for level, itemsets in enumerate( L[ 1 : ], 1 ):
        for freqSet in itemsets:
            # Every single item is an initial candidate right-hand side.
            singletons = [ frozenset( [ item ] ) for item in freqSet ]
            if level == 1:
                calcConf( freqSet, singletons, supportData, bigRuleList, minConf )
            else:
                # Entries beyond L[1] go through rulesFromConseq, which also
                # tries merged, larger right-hand sides.
                rulesFromConseq( freqSet, singletons, supportData, bigRuleList, minConf )
    return bigRuleList
def _splitRecords( lines ):
    '''Yield the '::'-separated fields of every non-blank line.'''
    for line in lines:
        line = line.rstrip( '\r\n' )
        # Skip blank lines (e.g. after a trailing newline) instead of
        # failing on int('') or a missing field.
        if line.strip():
            yield line.split( '::' )

def _likedMoviesByUser( lines, threshold=3 ):
    '''
    Return one transaction per user id, in order of first appearance: the
    list of movie ids that user rated above threshold. Lines are expected
    to start with user id, movie id, rating.
    '''
    likedByUser = {}
    userOrder = []
    for fields in _splitRecords( lines ):
        userId = int( fields[ 0 ] )
        if userId not in likedByUser:
            # Users without any liked movie still get an (empty)
            # transaction, so support is measured against all users.
            likedByUser[ userId ] = []
            userOrder.append( userId )
        if int( fields[ 2 ] ) > threshold:
            likedByUser[ userId ].append( int( fields[ 1 ] ) )
    return [ likedByUser[ userId ] for userId in userOrder ]

def _titlesByMovieId( lines ):
    '''
    Return a list in which the title of movie id N sits at index N - 1,
    which is how recommendMovies looks titles up. Ids that do not occur
    in the input get a placeholder title. Lines are expected to start with
    movie id, title.
    '''
    titles = {}
    for fields in _splitRecords( lines ):
        titles[ int( fields[ 0 ] ) ] = fields[ 1 ]
    if not titles:
        return []
    return [ titles.get( movieId, 'Unknown movie #%d' % movieId )
             for movieId in range( 1, max( titles ) + 1 ) ]

def _likedMovies( lines, threshold=3 ):
    '''Return the movie ids (second field) rated above threshold (third field).'''
    return [ int( fields[ 1 ] ) for fields in _splitRecords( lines )
             if int( fields[ 2 ] ) > threshold ]

if __name__ == '__main__':
    # Load the data sets. Each file is opened in a with-block so it is
    # closed even when opening or parsing a later file fails.
    start_time = time.time()
    with open('ratings.dat') as file_object:
        # One transaction per user. Every rating line is considered, and
        # the last user in the file is included too.
        file_list = _likedMoviesByUser( file_object )
    with open('movies.dat') as movies_object:
        # Indexed by movie id rather than by position in the file, so
        # dropped or missing entries cannot shift the id -> title mapping.
        movies_list = _titlesByMovieId( movies_object )
    with open('personalRatings.txt') as personal_object:
        personal_list = _likedMovies( personal_object )
    print('Read file sucess in',time.time()-start_time,'s')
    # Mine the frequent itemsets and derive the rules.
    L, suppData = apriori( file_list, 0.2 )
    rules = generateRules( L, suppData, minConf=0.5 )
    print ('Caculate rules success in',time.time()-start_time,'s')
    recommendMovies(rules,frozenset(personal_list),movies_list)
    print ('The program completes in',time.time()-start_time,'s')
運行結果
2017-11-05 14-53-20屏幕截圖.png
讀文件用了1.3秒,運行花了14秒,相信之後用numpy數組改進一下運行速度會更快