[Machine Learning in Action] Association Analysis with the Apriori Algorithm

apriori.py

# -*- coding: utf-8 -*-                                               
import numpy                                                          
# Load the sample transaction data set
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
# Build C1: every single item in the data set becomes its own one-item candidate set
def creatC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    # frozenset so the itemsets can later be used as dictionary keys
    return map(frozenset, C1)
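# For reference, on the sample data above creatC1 should return the five
# one-item candidates [frozenset([1]), frozenset([2]), frozenset([3]),
# frozenset([4]), frozenset([5])] (shown here in Python 2's list-style repr).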
# D is the data set, Ck is the list of candidate itemsets, and minSupport is the minimum support threshold
def scanD(D,Ck,minSupport):
    ssCnt = {}  # ssCnt maps each candidate itemset to the number of transactions that contain it
    for tid in D:
        for can in Ck:
            # issubset tests whether the candidate is contained in this transaction
            if can.issubset(tid):
                # count the candidate, creating its entry on first occurrence
                if can not in ssCnt: ssCnt[can] = 1
                else: ssCnt[can] += 1
    numItems = float(len(D))
    retList = []  # holds the itemsets that meet the minimum support
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]/numItems   # compute the support of this itemset
        if support >= minSupport:   # keep it if the support meets the threshold
            retList.insert(0,key)  # prepend the itemset to retList
        supportData[key] = support  # record the support of every candidate
    return retList, supportData                                       
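# For reference, scanD(loadDataSet(), creatC1(loadDataSet()), 0.5) should keep
# {1}, {2}, {3} and {5} (supports 0.5, 0.75, 0.75, 0.75) and drop {4}
# (support 0.25); supportData still records the support of all five candidates.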
# Quick check of the first pass (note: these statements run whenever the module is imported)
D = loadDataSet()
C1 = creatC1(D)
L1, suppData0 = scanD(D,C1,0.5)
def aprioriGen(Lk,k):  # build the candidate list Ck by merging frequent (k-1)-itemsets
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            # On the first call k-2 == 0, so L1 and L2 are both empty and every pair is merged;
            # in general, two itemsets are merged only when their first k-2 items match,
            # which produces each k-item candidate exactly once
            L1 = list(Lk[i])[:k-2]
            L2 = list(Lk[j])[:k-2]
            L1.sort()
            L2.sort()
            if L1==L2:
                retList.append(Lk[i]|Lk[j])
    return retList                                                    
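# For example, merging the frequent 1-itemsets {1}, {2}, {3} with k = 2 should
# yield the candidates {1,2}, {1,3}, {2,3}: k-2 == 0 means every pair shares the
# (empty) prefix, so every pair of 1-itemsets is unioned exactly once.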
# dataSet is the data set, minSupport is the minimum support threshold
def apriori(dataSet, minSupport = 0.5):
    C1 = creatC1(dataSet)
    D = map(set, dataSet)  # represent each transaction as a set
    # get L1 and the support data for the one-item candidates
    L1, supportData = scanD(D, C1, minSupport)
    # L collects L1, L2, L3, ...
    L = [L1]
    k = 2
    while(len(L[k-2])>0):
        Ck = aprioriGen(L[k-2], k)
        # scanD filters out the candidates in Ck that do not meet the minimum support
        Lk, supk = scanD(D, Ck, minSupport)
        supportData.update(supk)
        L.append(Lk)
        k += 1
    return L, supportData                                             
                                                                      
                                                                      
                                                                      
'''
The code above generates the frequent itemsets that satisfy the minimum support.
'''
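# For reference, on the sample data with minSupport = 0.5 the frequent itemsets
# work out by hand to:
#   L1: {1}, {2}, {3}, {5}
#   L2: {1,3}, {2,3}, {2,5}, {3,5}
#   L3: {2,3,5}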
                                                                      
                                                                      
'''
The code below mines association rules from those frequent itemsets.
'''
# minConf is the minimum confidence threshold; supportData holds the support of every frequent itemset
def generateRules(L, supportData, minConf=0.7):
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if(i>1):     # frequent itemsets with more than two items: grow the consequents recursively
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:   # frequent itemsets with exactly two items: compute the confidences directly
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList   # the list of rules together with their confidences
# Compute the confidence of each candidate rule
def calcConf(freqSet, H, supportData, br1, minConf = 0.7):
    prunedH = []
    for conseq in H:
        # freqSet-conseq is set difference: the elements of conseq are removed from freqSet
        # (this is not numeric subtraction)
        conf = supportData[freqSet]/supportData[freqSet-conseq]   # confidence of (freqSet-conseq) --> conseq
        if conf >=minConf:
            print freqSet-conseq, '-->', conseq, 'conf:', conf
            br1.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH                                                    
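# For example, with freqSet = {2,5} and conseq = {5}:
#   conf({2} --> {5}) = support({2,5}) / support({2}) = 0.75 / 0.75 = 1.0
# which matches the rule frozenset([2]) --> frozenset([5]) conf: 1.0 in the output below.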
# Generate further candidate consequents for the rules of a frequent itemset
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    m = len(H[0])  # size of the consequents currently in H
    if(len(freqSet) > (m+1)):
        Hmp1 = aprioriGen(H, m+1)  # merge the consequents into (m+1)-item consequents
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)  # keep only those that yield a rule above minConf
        if(len(Hmp1)>1):
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)
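# Note: for frequent itemsets with three or more items, generateRules hands the
# one-item consequents H1 straight to rulesFromConseq, which immediately grows
# them to two-item consequents, so the one-item consequents are never scored by
# calcConf. That is why a rule such as {2,3} --> {5} (confidence 1.0) does not
# appear in the output below.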



test.py: run this file to see the results

# -*- coding: utf-8 -*-
import numpy
import apriori

dataSet = apriori.loadDataSet()
L,suppData = apriori.apriori(dataSet)
# print L
# print suppData  # suppData holds the support of every itemset
rules = apriori.generateRules(L, suppData, 0.5)
print rules


Output:

frozenset([3]) --> frozenset([1]) conf: 0.666666666667
frozenset([1]) --> frozenset([3]) conf: 1.0
frozenset([5]) --> frozenset([2]) conf: 1.0
frozenset([2]) --> frozenset([5]) conf: 1.0
frozenset([3]) --> frozenset([2]) conf: 0.666666666667
frozenset([2]) --> frozenset([3]) conf: 0.666666666667
frozenset([5]) --> frozenset([3]) conf: 0.666666666667
frozenset([3]) --> frozenset([5]) conf: 0.666666666667
frozenset([5]) --> frozenset([2, 3]) conf: 0.666666666667
frozenset([3]) --> frozenset([2, 5]) conf: 0.666666666667
frozenset([2]) --> frozenset([3, 5]) conf: 0.666666666667
[(frozenset([3]), frozenset([1]), 0.6666666666666666), (frozenset([1]), frozenset([3]), 1.0), (frozenset([5]), frozenset([2]), 1.0), (frozenset([2]), frozenset([5]), 1.0), (frozenset([3]), frozenset([2]), 0.6666666666666666), (frozenset([2]), frozenset([3]), 0.6666666666666666), (frozenset([5]), frozenset([3]), 0.6666666666666666), (frozenset([3]), frozenset([5]), 0.6666666666666666), (frozenset([5]), frozenset([2, 3]), 0.6666666666666666), (frozenset([3]), frozenset([2, 5]), 0.6666666666666666), (frozenset([2]), frozenset([3, 5]), 0.6666666666666666)]
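
Note that the listings above are written for Python 2 (print statements, and map returning a list). A minimal sketch of the edits needed to run them under Python 3, assuming nothing else is changed, would be:

# in creatC1: materialise the map object into a list
return list(map(frozenset, C1))
# in apriori: likewise for the transaction sets
D = list(map(set, dataSet))
# in calcConf and test.py: use the print() function
print(freqSet-conseq, '-->', conseq, 'conf:', conf)
print(rules)

Under Python 3 the printed itemsets will also use the frozenset({...}) repr rather than the frozenset([...]) form shown above.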



Reference: Peter Harrington. Machine Learning in Action.
