apriori.py
# -*- coding: utf-8 -*-
import numpy
def loadDataSet():
    """Return the small hard-coded transaction data set used by the demo.

    Returns:
        A new list of four transactions, each a list of integer item ids.
        A fresh list is built on every call, so callers may mutate it.
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def creatC1(dataSet):
    """Build the candidate 1-item sets (C1) from a transaction data set.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list of single-element frozensets, one per distinct item, ordered
        by item value. frozenset is used so the candidates can serve as
        dictionary keys.
    """
    distinct = set()
    for transaction in dataSet:
        distinct.update(transaction)
    # Return a real list rather than map(...): on Python 3 map() yields a
    # one-shot iterator, and the candidates are re-scanned once per
    # transaction when support is counted.
    return [frozenset([item]) for item in sorted(distinct)]
def scanD(D, Ck, minSupport):
    """Count candidate item sets over the transactions and keep frequent ones.

    Args:
        D: sequence of transactions (sets or other iterables of items). It
            must support len().
        Ck: re-iterable collection of candidate frozensets; it is scanned
            once per transaction.
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple (retList, supportData). retList holds the candidates whose
        support is >= minSupport. supportData maps every candidate that
        occurred in at least one transaction to its support, whether or not
        it met the threshold.
    """
    # ssCnt maps each candidate to the number of transactions containing it.
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))  # float so the division below is never integer division
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # The historical implementation prepended each hit; appending and then
    # reversing yields the same order without quadratic list shifting.
    retList.reverse()
    return retList, supportData
# Module-level demo: load the sample transactions, build the candidate
# 1-item sets, and keep those with support >= 0.5.
# NOTE: these statements are at module top level, so they run on import and
# define the globals D, C1, L1 and suppData0.
D = loadDataSet()
C1 = creatC1(D)
L1, suppData0 = scanD(D,C1,0.5)
def aprioriGen(Lk, k):
    """Generate candidate k-item sets (Ck) by joining frequent (k-1)-item sets.

    Two item sets are joined when their first k-2 items, taken in sorted
    order, are equal. For k == 2 the prefix is empty, so every pair is joined.

    Args:
        Lk: list of frozensets, all of size k-1, with mutually comparable items.
        k: size of the item sets to generate.

    Returns:
        A list of frozensets, each the union of one joined pair.
    """
    # Sort each item set *before* taking the prefix. Slicing the raw
    # list(frozenset) first would pick items in hash order, so sets sharing a
    # sorted prefix could fail to match. Doing it once per set also avoids
    # re-sorting inside the pair loop.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
def apriori(dataSet, minSupport=0.5):
    """Find all frequent item sets in a data set with the Apriori algorithm.

    Args:
        dataSet: list of transactions, each an iterable of hashable items.
        minSupport: minimum fraction of transactions that must contain an
            item set for it to count as frequent.

    Returns:
        A tuple (L, supportData). L[i] is the list of frequent item sets of
        size i + 1; the final entry of L is always an empty list, because
        the loop stops after the first size that yields nothing.
        supportData maps each counted item set to its support.
    """
    C1 = creatC1(dataSet)
    # Materialise the transactions as a list: they are scanned once per item
    # set size and scanD needs len(), neither of which works with the
    # one-shot iterator that map() returns on Python 3.
    D = [set(transaction) for transaction in dataSet]
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]  # L collects L1, L2, L3, ...
    k = 2
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        # Keep only the candidates that reach the minimum support.
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
'''
以上部分生成了滿足最小支持度的頻繁項目集合
'''
'''
下面的內容是從頻繁項集中挖掘關聯規則
'''
def generateRules(L, supportData, minConf=0.7):
    """Mine association rules from the frequent item sets.

    Args:
        L: list of lists of frequent item sets, where L[i] holds the sets of
            size i + 1 (as returned by apriori).
        supportData: dict mapping item sets to their support.
        minConf: minimum confidence a rule must reach to be reported.

    Returns:
        A list of (antecedent, consequent, confidence) tuples.
    """
    bigRuleList = []
    # Start at index 1: single-item sets cannot be split into a rule.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Sets with three or more items: grow the consequents.
                # NOTE(review): rulesFromConseq joins H1 into 2-item
                # consequents before scoring anything, so rules with a
                # single-item consequent are never evaluated for these sets.
                # Confirm whether that is intended.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                # Two-item sets: score the single-item consequents directly.
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Score candidate consequents of one frequent item set by confidence.

    For each consequent in H the rule is (freqSet - conseq) --> conseq, with
    confidence support(freqSet) / support(freqSet - conseq).

    Args:
        freqSet: the frequent item set (frozenset) the rules are drawn from.
        H: iterable of candidate consequents (frozensets).
        supportData: dict mapping item sets to their support; it must
            contain freqSet and every freqSet - conseq.
        br1: list that accepted rules are appended to, as
            (antecedent, consequent, confidence) tuples. Mutated in place.
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        The list of consequents whose rule met minConf.
    """
    prunedH = []
    for conseq in H:
        # Set difference, not arithmetic: the items of freqSet not in conseq.
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # A single pre-formatted argument prints identically whether
            # print is a statement (Python 2) or a function (Python 3).
            print('%s --> %s conf: %s' % (antecedent, conseq, conf))
            br1.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Recursively grow rule consequents for one frequent item set.

    The consequents in H (all of size m) are joined into consequents of size
    m + 1, scored with calcConf, and the survivors are grown again for as
    long as freqSet is large enough to leave a non-empty antecedent.

    Args:
        freqSet: the frequent item set (frozenset) the rules are drawn from.
        H: list of same-sized candidate consequents (frozensets).
        supportData: dict mapping item sets to their support.
        br1: list that accepted rules are appended to. Mutated in place.
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        None; results are delivered through br1.
    """
    if not H:
        # Nothing to grow; also avoids an IndexError on H[0] below.
        return
    m = len(H[0])
    if len(freqSet) > m + 1:
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        # At least two surviving consequents are needed to join them further.
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)
test.py :在這裏面運行查看結果
# -*- coding: utf-8 -*-
import numpy
import apriori
# Mine the frequent item sets from the bundled sample data, then derive the
# association rules whose confidence is at least 0.5.
dataSet = apriori.loadDataSet()
L, suppData = apriori.apriori(dataSet)
# Uncomment to inspect the intermediate results:
# print(L)
# print(suppData)  # support of each counted item set
rules = apriori.generateRules(L, suppData, 0.5)
# Parenthesised single-argument form runs on both Python 2 and Python 3.
print(rules)
運行結果:
frozenset([3]) --> frozenset([1]) conf: 0.666666666667
frozenset([1]) --> frozenset([3]) conf: 1.0
frozenset([5]) --> frozenset([2]) conf: 1.0
frozenset([2]) --> frozenset([5]) conf: 1.0
frozenset([3]) --> frozenset([2]) conf: 0.666666666667
frozenset([2]) --> frozenset([3]) conf: 0.666666666667
frozenset([5]) --> frozenset([3]) conf: 0.666666666667
frozenset([3]) --> frozenset([5]) conf: 0.666666666667
frozenset([5]) --> frozenset([2, 3]) conf: 0.666666666667
frozenset([3]) --> frozenset([2, 5]) conf: 0.666666666667
frozenset([2]) --> frozenset([3, 5]) conf: 0.666666666667
[(frozenset([3]), frozenset([1]), 0.6666666666666666), (frozenset([1]), frozenset([3]), 1.0), (frozenset([5]), frozenset([2]), 1.0), (frozenset([2]), frozenset([5]), 1.0), (frozenset([3]), frozenset([2]), 0.6666666666666666), (frozenset([2]), frozenset([3]),
0.6666666666666666), (frozenset([5]), frozenset([3]), 0.6666666666666666), (frozenset([3]), frozenset([5]), 0.6666666666666666), (frozenset([5]), frozenset([2, 3]), 0.6666666666666666), (frozenset([3]), frozenset([2, 5]), 0.6666666666666666), (frozenset([2]),
frozenset([3, 5]), 0.6666666666666666)]
參考資料:【美】Peter Harrington.《Machine Learning in Action》