FP-growth(頻繁模式增長)
- 數(shù)據(jù)庫的第一遍掃描用來統(tǒng)計出現(xiàn)的頻率;第二遍掃面中考慮那些頻繁元素
優(yōu)點:
缺點:
- 實現(xiàn)比較困難嚎京,在某些數(shù)據(jù)集上性能會下降
適用數(shù)據(jù)類型:
簡單數(shù)據(jù)集及數(shù)據(jù)包裝器
def loadSimpDat():
simpDat = [['r', 'z', 'h', 'j', 'p'],
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
['z'],
['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'],
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
return simpDat
def createInitSet(dataSet):
retDict = {}
for trans in dataSet:
retDict[frozenset(trans)] = 1
return retDict
simpDat = loadSimpDat()
initSet = createInitSet(simpDat)
initSet
{frozenset({'h', 'j', 'p', 'r', 'z'}): 1,
frozenset({'s', 't', 'u', 'v', 'w', 'x', 'y', 'z'}): 1,
frozenset({'z'}): 1,
frozenset({'n', 'o', 'r', 's', 'x'}): 1,
frozenset({'p', 'q', 'r', 't', 'x', 'y', 'z'}): 1,
frozenset({'e', 'm', 'q', 's', 't', 'x', 'y', 'z'}): 1}
構(gòu)建FP樹
#節(jié)點數(shù)據(jù)結(jié)構(gòu)
class treeNode:
def __init__(self,nameValue,numOccur,parentNode):
self.name = nameValue
self.count = numOccur
self.nodeLink = None
self.parent = parentNode
self.children = {}
def inc(self,numOccur):
self.count += numOccur
def disp(self,ind=1):
print(' '*ind,self.name,' ',self.count)
for child in self.children.values(): #self.children.values()是一個節(jié)點
child.disp(ind+1)
rootNode = treeNode('pyramid',9,None) #創(chuàng)建一個單節(jié)點
rootNode.children['eye'] = treeNode('eye', 13, None)
rootNode.children['phoenix'] = treeNode('phoenix', 3, None)
rootNode.disp()
pyramid 9
eye 13
phoenix 3
#構(gòu)建FP樹
def createTree(dataSet, minSup=1):
headerTable = {}
#第一次掃描D
for trans in dataSet:#每個事務(wù)
for item in trans:#某個事務(wù)的每個元素
headerTable[item] = headerTable.get(item, 0) + dataSet[trans] #統(tǒng)計每個元素出現(xiàn)的頻率
headerTableCopy = headerTable.copy()
for k in headerTableCopy.keys(): #過濾
if headerTable[k] < minSup:
del(headerTable[k])
#單元素頻繁項集
freqItemSet = set(headerTable.keys())
# print ('freqItemSet: ',freqItemSet)#freqItemSet: {'y', 'z', 's', 't', 'x', 'r'}
if len(freqItemSet) == 0: return None, None #若所有項都不頻繁
for k in headerTable:
headerTable[k] = [headerTable[k], None] #格式化 headerTable
# print(headerTable)#{'z': [5, None], 'r': [3, None], 'y': [3, None], 's': [3, None], 't': [3, None], 'x': [4, None]}
retTree = treeNode('Null Set', 1, None) #創(chuàng)建根節(jié)點
#第二次掃描D
for tranSet, count in dataSet.items(): #遍歷每一個事務(wù)
localD = {}
for item in tranSet: #遍歷每一個元素
if item in freqItemSet:#若元素是頻繁的
localD[item] = headerTable[item][0] #記錄個數(shù)
# print(localD)#{'z': 5, 'r': 3}
if len(localD) > 0:#一個事務(wù)中叮盘,頻繁項至少有一個秧廉,則增長分支
#先排序怒竿,方便增長分支,排序之后的頭指針表
orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
# print(orderedItems)#[1'z', 'r']
#增長分支
updateTree(orderedItems, retTree, headerTable, count)#增長
return retTree, headerTable
#增長分支
def updateTree(items, inTree, headerTable, count):
if items[0] in inTree.children:#若第一個元素是子節(jié)點,inTree.children:dict
inTree.children[items[0]].inc(count) #計數(shù)+1
else:
inTree.children[items[0]] = treeNode(items[0], count, inTree)#創(chuàng)建分支
if headerTable[items[0]][1] == None: #若指針為空
headerTable[items[0]][1] = inTree.children[items[0]]#指針指向本節(jié)點
else:#若指針已經(jīng)有指向湃望,則再更新,在末尾添加一個指向
updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
if len(items) > 1:#items[1::]:刪除第一個元素痰驱,繼續(xù)創(chuàng)建分支
updateTree(items[1::], inTree.children[items[0]], headerTable, count)
#在末尾添加指針
def updateHeader(nodeToTest, targetNode):
while (nodeToTest.nodeLink != None): #沿著nodelink到達鏈表末尾
nodeToTest = nodeToTest.nodeLink
nodeToTest.nodeLink = targetNode #添加下一個指向
myFPtree,myHeaderTab = createTree(initSet, minSup=3)
myFPtree.disp()
Null Set 1
z 5
r 1
x 3
y 2
s 2
t 2
r 1
y 1
t 1
x 1
r 1
s 1
myHeaderTab
{'r': [3, <__main__.treeNode at 0x7f5cfc1e1e80>],
'z': [5, <__main__.treeNode at 0x7f5cfc1a5c88>],
'x': [4, <__main__.treeNode at 0x7f5cfc1e1e48>],
'y': [3, <__main__.treeNode at 0x7f5cfc1e1eb8>],
's': [3, <__main__.treeNode at 0x7f5cfc1e1da0>],
't': [3, <__main__.treeNode at 0x7f5cfc1e1fd0>]}
發(fā)現(xiàn)以給定元素結(jié)尾的所有路徑的函數(shù)
#上溯一條路徑
def ascendTree(leafNode, prefixPath):
if leafNode.parent != None:#若父節(jié)點存在
prefixPath.append(leafNode.name)
ascendTree(leafNode.parent, prefixPath)#繼續(xù)向上
#找到給定元素的所有前綴路徑
def findPrefixPath(basePat, treeNode):
condPats = {}
while treeNode != None: #若節(jié)點存在
prefixPath = []
ascendTree(treeNode, prefixPath) #上溯路徑
if len(prefixPath) > 1:
condPats[frozenset(prefixPath[1:])] = treeNode.count
treeNode = treeNode.nodeLink #跳到下一個指向的位置
return condPats
findPrefixPath('x', myHeaderTab['x'][1])
{frozenset({'z'}): 3}
myHeaderTab
{'r': [3, <__main__.treeNode at 0x7f5cfc1e1e80>],
'z': [5, <__main__.treeNode at 0x7f5cfc1a5c88>],
'x': [4, <__main__.treeNode at 0x7f5cfc1e1e48>],
'y': [3, <__main__.treeNode at 0x7f5cfc1e1eb8>],
's': [3, <__main__.treeNode at 0x7f5cfc1e1da0>],
't': [3, <__main__.treeNode at 0x7f5cfc1e1fd0>]}
遞歸查找頻繁項集的 mineTree 函數(shù)
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
#對頻繁項排序,頻繁數(shù)從小到大
bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[0])]
# print(bigL)#['r', 's', 't', 'x', 'y', 'z']
for basePat in bigL: #start from bottom of header table
newFreqSet = preFix.copy()#每換一次元素证芭,都初始化一次
newFreqSet.add(basePat)
# print ('finalFrequent Item: ',newFreqSet)
freqItemList.append(newFreqSet)
# print(freqItemList)
#1. 找到條件模式基
condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
# print ('condPattBases :',basePat, condPattBases)
#2. 構(gòu)建條件FP樹
myCondTree, myHead = createTree(condPattBases, minSup)
# print ('head from conditional tree: ', myHead)
if myHead != None: #3. mine cond. FP-tree
# print('conditional tree for: ',newFreqSet)
# myCondTree.disp(1)
mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
myFPtree.disp()
Null Set 1
z 5
r 1
x 3
y 2
s 2
t 2
r 1
y 1
t 1
x 1
r 1
s 1
freqItemList = []
mineTree(myFPtree, headerTable=myHeaderTab, minSup=3, preFix=set([]), freqItemList=freqItemList)
print(freqItemList)
[{'r'}, {'s'}, {'s', 'x'}, {'t'}, {'t', 'x'}, {'t', 'y', 'x'}, {'t', 'y'}, {'t', 'z'}, {'t', 'z', 'x'}, {'y', 't', 'z', 'x'}, {'y', 't', 'z'}, {'x'}, {'z', 'x'}, {'y'}, {'y', 'x'}, {'y', 'x', 'z'}, {'y', 'z'}, {'z'}]
示例:從新聞網(wǎng)站點擊流中挖掘
parseDat = [line.split() for line in open('../../Reference Code/Ch12/kosarak.dat').readlines()]
parseDat
[['1', '2', '3'],
['1'],
['4', '5', '6', '7'],
['1', '8'],
['9', '10'],
['11', '6', '12', '13', '14', '15', '16'],
['1', '3', '7'],
['17', '18'],
['11', '6', '19', '20', '21', '22', '23', '24'],
['1', '25', '3'],
['26', '3'],
['11',
'27',
'6',
'3',
'28',
'7',
'29',
'30',
'31',
'32',
'33',
'34',
'35',
'36',
'37'],
['6', '2', '38'],
['39',
'11',
'27',
'1',
'40',
'6',
'41',
'42',
'43',
'44',
'45',
'46',
'47',
'3',
'48',
'7',
'49',
'50',
'51'],
['52', '6', '3', '53'],
['54', '1', '6', '55'],
['11', '6', '56', '57', '58', '59', '60', '61', '62', '63', '64'],
['3'],
['1', '65', '66', '67', '68', '3'],
['69', '11', '1', '6'],
['11', '70', '6'],
['6', '3', '71'],
['72', '6', '73'],
['74'],
['75', '76'],
['6', '3', '77'],
['78', '79', '80', '81'],
['82', '6', '83', '7', '84', '85', '86', '87', '88'],
['11',
'27',
'1',
'6',
'89',
'90',
'91',
'92',
'93',
'14',
'94',
'95',
'96',
'97',
'98',
'99',
'100',
'101',
'102',
'103',
'104',
'105',
'106',
'107',
'108',
'109',
'110',
'111',
'112',
'113',
'114',
'115',
'116',
'117',
'118',
'119',
'120',
'121',
'122',
'123',
'124',
'125',
'126',
'127',
'128',
'129',
'130',
'131',
'132',
'133',
'64',
'134',
'135',
'136',
'137'],
['6', '138'],
對初始集合格式化,構(gòu)建FP樹萄唇,尋找至少被10w人瀏覽過的新聞報道
initSet = createInitSet(parseDat)
myFPtree, myHeaderTab = createTree(initSet, 100000)
myFPtree.disp()
Null Set 1
3 76514
1 12917
1 16829
6 412762
11 261773
3 117401
1 34141
1 43366
3 68888
1 13436
1 16461
11 21190
3 9718
1 1565
1 1882
myFreqList = []
mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreqList)
myFreqList
[{'1'},
{'1', '6'},
{'11'},
{'11', '6'},
{'3'},
{'11', '3'},
{'11', '3', '6'},
{'3', '6'},
{'6'}]