[Machine Learning From Scratch]-unsupervised_learning-fp_growth

from __future__ import division, print_function
import numpy as np
import itertools


class FPTreeNode():
    def __init__(self, item=None, support=1):
        # 'Value' of the item
        self.item = item
        # Number of times the item occurs in a
        # transaction
        self.support = support
        # Child nodes in the FP Growth Tree
        self.children = {}


class FPGrowth():
    """A method for determining frequent itemsets in a transactional database. 
    This is done by building a so called FP Growth tree, which can then be mined
    to collect the frequent itemsets. More effective than Apriori for large transactional
    databases.
    Parameters:
    -----------
    min_sup: float
        The minimum fraction of transactions an itemets needs to
        occur in to be deemed frequent
    """
    def __init__(self, min_sup=0.3):
        self.min_sup = min_sup
        # The root of the initial FP Growth Tree
        self.tree_root = None
        # Prefixes of itemsets in the FP Growth Tree
        self.prefixes = {}
        self.frequent_itemsets = []

    # Count the number of transactions that contains item.
    def _calculate_support(self, item, transactions):
        count = 0
        for transaction in transactions:
            if item in transaction:
                count += 1
        support = count
        return support


    def _get_frequent_items(self, transactions):
        """ Returns a set of frequent items. An item is determined to
        be frequent if there are atleast min_sup transactions that contains
        it. """
        # Get all unique items in the transactions
        unique_items = set(
            item for transaction in transactions for item in transaction)
        items = []
        for item in unique_items:
            sup = self._calculate_support(item, transactions)
            if sup >= self.min_sup:
                items.append([item, sup])
        # Sort by support - Highest to lowest
        items.sort(key=lambda item: item[1], reverse=True)
        frequent_items = [[el[0]] for el in items]
        # Only return the items
        return frequent_items

    def _insert_tree(self, node, children):
        """ Recursive method which adds nodes to the tree. """
        if not children:
            return
        # Create new node as the first item in children list
        child_item = children[0]
        child = FPTreeNode(item=child_item)
        # If parent already contains item => increase the support
        if child_item in node.children:
            node.children[child.item].support += 1
        else:
            node.children[child.item] = child

        # Execute _insert_tree on the rest of the children list
        # from the new node
        self._insert_tree(node.children[child.item], children[1:])

    def _construct_tree(self, transactions, frequent_items=None):
        if not frequent_items:
            # Get frequent items sorted by support
            frequent_items = self._get_frequent_items(transactions)
        unique_frequent_items = list(
            set(item for itemset in frequent_items for item in itemset))
        # Construct the root of the FP Growth tree
        root = FPTreeNode()
        for transaction in transactions:
            # Remove items that are not frequent according to
            # unique_frequent_items
            transaction = [item for item in transaction if item in unique_frequent_items]
            transaction.sort(key=lambda item: frequent_items.index([item]))
            self._insert_tree(root, transaction)

        return root

    def print_tree(self, node=None, indent_times=0):
        """ Recursive method which prints the FP Growth Tree """
        if not node:
            node = self.tree_root
        indent = "    " * indent_times
        print ("%s%s:%s" % (indent, node.item, node.support))
        for child_key in node.children:
            child = node.children[child_key]
            self.print_tree(child, indent_times + 1)


    def _is_prefix(self, itemset, node):
        """ Makes sure that the first item in itemset is a child of node 
        and that every following item in itemset is reachable via that path """
        for item in itemset:
            if not item in node.children:
                return False
            node = node.children[item]
        return True


    def _determine_prefixes(self, itemset, node, prefixes=None):
        """ Recursive method that adds prefixes to the itemset by traversing the 
        FP Growth Tree"""
        if not prefixes:
            prefixes = []

        # If the current node is a prefix to the itemset
        # add the current prefixes value as prefix to the itemset
        if self._is_prefix(itemset, node):
            itemset_key = self._get_itemset_key(itemset)
            if not itemset_key in self.prefixes:
                self.prefixes[itemset_key] = []
            self.prefixes[itemset_key] += [{"prefix": prefixes, "support": node.children[itemset[0]].support}]

        for child_key in node.children:
            child = node.children[child_key]
            # Recursive call with child as new node. Add the child item as potential
            # prefix.
            self._determine_prefixes(itemset, child, prefixes + [child.item])


    def _get_itemset_key(self, itemset):
        """ Determines the look of the hashmap key for self.prefixes
        List of more strings than one gets joined by '-' """
        if len(itemset) > 1:
            itemset_key = "-".join(itemset)
        else:
            itemset_key = str(itemset[0])
        return itemset_key

    def _determine_frequent_itemsets(self, conditional_database, suffix):
        # Calculate new frequent items from the conditional database
        # of suffix
        frequent_items = self._get_frequent_items(conditional_database)

        cond_tree = None

        if suffix:
            cond_tree = self._construct_tree(conditional_database, frequent_items)
            # Output new frequent itemset as the suffix added to the frequent
            # items
            self.frequent_itemsets += [el + suffix for el in frequent_items]

        # Find larger frequent itemset by finding prefixes
        # of the frequent items in the FP Growth Tree for the conditional
        # database.
        self.prefixes = {}
        for itemset in frequent_items:
            # If no suffix (first run)
            if not cond_tree:
                cond_tree = self.tree_root
            # Determine prefixes to itemset
            self._determine_prefixes(itemset, cond_tree)
            conditional_database = []
            itemset_key = self._get_itemset_key(itemset)
            # Build new conditional database
            if itemset_key in self.prefixes:
                for el in self.prefixes[itemset_key]:
                    # If support = 4 => add 4 of the corresponding prefix set
                    for _ in range(el["support"]):
                        conditional_database.append(el["prefix"])
                # Create new suffix
                new_suffix = itemset + suffix if suffix else itemset
                self._determine_frequent_itemsets(conditional_database, suffix=new_suffix)

    def find_frequent_itemsets(self, transactions, suffix=None, show_tree=False):
        self.transactions = transactions

        # Build the FP Growth Tree
        self.tree_root = self._construct_tree(transactions)
        if show_tree:
            print ("FP-Growth Tree:")
            self.print_tree(self.tree_root)

        self._determine_frequent_itemsets(transactions, suffix=None)

        return self.frequent_itemsets
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
  • 序言:七十年代末,一起剝皮案震驚了整個(gè)濱河市柿估,隨后出現(xiàn)的幾起案子,更是在濱河造成了極大的恐慌滓彰,老刑警劉巖解取,帶你破解...
    沈念sama閱讀 222,865評(píng)論 6 518
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件,死亡現(xiàn)場(chǎng)離奇詭異,居然都是意外死亡和蚪,警方通過查閱死者的電腦和手機(jī)双藕,發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 95,296評(píng)論 3 399
  • 文/潘曉璐 我一進(jìn)店門淑趾,熙熙樓的掌柜王于貴愁眉苦臉地迎上來,“玉大人忧陪,你說我怎么就攤上這事扣泊。” “怎么了嘶摊?”我有些...
    開封第一講書人閱讀 169,631評(píng)論 0 364
  • 文/不壞的土叔 我叫張陵延蟹,是天一觀的道長。 經(jīng)常有香客問我叶堆,道長等孵,這世上最難降的妖魔是什么? 我笑而不...
    開封第一講書人閱讀 60,199評(píng)論 1 300
  • 正文 為了忘掉前任蹂空,我火速辦了婚禮俯萌,結(jié)果婚禮上,老公的妹妹穿的比我還像新娘上枕。我一直安慰自己咐熙,他們只是感情好,可當(dāng)我...
    茶點(diǎn)故事閱讀 69,196評(píng)論 6 398
  • 文/花漫 我一把揭開白布辨萍。 她就那樣靜靜地躺著棋恼,像睡著了一般。 火紅的嫁衣襯著肌膚如雪锈玉。 梳的紋絲不亂的頭發(fā)上爪飘,一...
    開封第一講書人閱讀 52,793評(píng)論 1 314
  • 那天,我揣著相機(jī)與錄音拉背,去河邊找鬼师崎。 笑死,一個(gè)胖子當(dāng)著我的面吹牛椅棺,可吹牛的內(nèi)容都是我干的犁罩。 我是一名探鬼主播,決...
    沈念sama閱讀 41,221評(píng)論 3 423
  • 文/蒼蘭香墨 我猛地睜開眼两疚,長吁一口氣:“原來是場(chǎng)噩夢(mèng)啊……” “哼床估!你這毒婦竟也來了?” 一聲冷哼從身側(cè)響起诱渤,我...
    開封第一講書人閱讀 40,174評(píng)論 0 277
  • 序言:老撾萬榮一對(duì)情侶失蹤丐巫,失蹤者是張志新(化名)和其女友劉穎,沒想到半個(gè)月后,有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體递胧,經(jīng)...
    沈念sama閱讀 46,699評(píng)論 1 320
  • 正文 獨(dú)居荒郊野嶺守林人離奇死亡碑韵,尸身上長有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點(diǎn)故事閱讀 38,770評(píng)論 3 343
  • 正文 我和宋清朗相戀三年,在試婚紗的時(shí)候發(fā)現(xiàn)自己被綠了谓着。 大學(xué)時(shí)的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片泼诱。...
    茶點(diǎn)故事閱讀 40,918評(píng)論 1 353
  • 序言:一個(gè)原本活蹦亂跳的男人離奇死亡,死狀恐怖赊锚,靈堂內(nèi)的尸體忽然破棺而出治筒,到底是詐尸還是另有隱情,我是刑警寧澤舷蒲,帶...
    沈念sama閱讀 36,573評(píng)論 5 351
  • 正文 年R本政府宣布耸袜,位于F島的核電站,受9級(jí)特大地震影響牲平,放射性物質(zhì)發(fā)生泄漏堤框。R本人自食惡果不足惜,卻給世界環(huán)境...
    茶點(diǎn)故事閱讀 42,255評(píng)論 3 336
  • 文/蒙蒙 一纵柿、第九天 我趴在偏房一處隱蔽的房頂上張望蜈抓。 院中可真熱鬧,春花似錦昂儒、人聲如沸沟使。這莊子的主人今日做“春日...
    開封第一講書人閱讀 32,749評(píng)論 0 25
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽腊嗡。三九已至,卻和暖如春拾酝,著一層夾襖步出監(jiān)牢的瞬間燕少,已是汗流浹背。 一陣腳步聲響...
    開封第一講書人閱讀 33,862評(píng)論 1 274
  • 我被黑心中介騙來泰國打工蒿囤, 沒想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留客们,地道東北人。 一個(gè)月前我還...
    沈念sama閱讀 49,364評(píng)論 3 379
  • 正文 我出身青樓蟋软,卻偏偏與公主長得像镶摘,于是被迫代替她去往敵國和親。 傳聞我的和親對(duì)象是個(gè)殘疾皇子岳守,可洞房花燭夜當(dāng)晚...
    茶點(diǎn)故事閱讀 45,926評(píng)論 2 361

推薦閱讀更多精彩內(nèi)容