# 決策樹可以從數據集合中提取出一系列規則,從而分類樣本。它的優勢是理解數據蘊含信息。
# 思想:利用信息增益(information gain)【度量數據集信息的方式—香農熵(entropy)】計算得出最好的劃分數據集的特徵,
#      用遞歸的方式不斷找下一個最好的劃分特徵,直到遍歷完所有的屬性(特徵),或剩下實例的類別都相同時。
#      返回的是用嵌套字典表示的樹結構(嵌套就是遞歸實現的)。
# 侷限:ID3無法直接處理數值型數據,一般應用對象是標稱型數據。
import operator
import numpy as np
'''
ID3決策樹預測隱形眼鏡類型
'''
#----------------------------- build the tree --------------------------
# 多數表決法, 對統計個數排序
def majorityCnt(classList):
    """Majority vote: return the class label occurring most often in classList.

    Ties are broken by first appearance in the list (insertion order).
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max over the dict iterates in insertion order, so the first-seen label
    # wins on ties -- same result as a stable reverse sort by count.
    return max(tally, key=tally.get)
# 計算香農熵
def calcShannonEnt(data):
    """Return the Shannon entropy H = -sum(p * log2(p)) of the class labels in data.

    data: list of example rows; the LAST element of each row is the class label.
    """
    numExm = len(data)
    labelCounts = {}
    # Count how many examples fall into each class.
    for featVec in data:
        curLabel = featVec[-1]
        labelCounts[curLabel] = labelCounts.get(curLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / float(numExm)
        # BUG FIX: np.log(prob, 2) is wrong -- np.log's second positional
        # argument is `out`, not a base. Use np.log2 for base-2 logarithm.
        shannonEnt -= prob * np.log2(prob)
    return shannonEnt
# 拆出數據集中屬性值feature[feat]爲val的部分,返回剩餘集(去掉該屬性)
def splitDataset(data, feat, val):
    """Select the rows of data whose feat-th attribute equals val, returning
    those rows with that attribute column removed (the class label stays)."""
    return [row[:feat] + row[feat + 1:] for row in data if row[feat] == val]
# 找最好的劃分數據集的屬性, 作爲樹的結點
def chooseBestFeatureToSplit(data):
    """Return the column index of the feature with the highest information gain.

    data: list of example rows; the last column is the class label.
    Returns -1 if no feature yields a positive information gain.
    """
    baseEntropy = calcShannonEnt(data)  # entropy of the whole data set
    bestInfoGain = 0.0
    bestFeature = -1
    numFeat = len(data[0]) - 1  # number of features; last column is the class label
    for i in range(numFeat):  # evaluate each candidate feature
        uniqueVals = set(exm[i] for exm in data)
        newEntropy = 0.0
        for val in uniqueVals:  # weighted entropy over each value's partition
            subData = splitDataset(data, i, val)
            # BUG FIX: the original referenced the undefined name `subDataset`
            # here (NameError at runtime); the split result is `subData`.
            prob = len(subData) / float(len(data))
            newEntropy += prob * calcShannonEnt(subData)
        infoGain = baseEntropy - newEntropy  # information gain of feature i
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
def createTree(data, labels):
    """Recursively build an ID3 decision tree represented as nested dicts.

    data: list of rows; the last column of each row is the class label.
    labels: feature names for the columns of data (NOT mutated).
    Returns either a class label (leaf) or {featureName: {featureValue: subtree}}.
    """
    classList = [exm[-1] for exm in data]
    # Stop condition 1: every remaining example has the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop condition 2: no features left (only the class column) -- majority vote.
    if len(data[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(data)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # BUG FIX: the original did `del(labels[bestFeat])`, mutating the CALLER's
    # label list, which breaks later uses such as classify(tree, labels, ...).
    # Build the reduced label list without touching the argument.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    uniqueVals = set(exm[bestFeat] for exm in data)  # all values of the chosen feature
    for val in uniqueVals:
        myTree[bestFeatLabel][val] = createTree(splitDataset(data, bestFeat, val), subLabels)
    return myTree
def classify(myTree, featLab, testVec):
    """Classify a feature vector by walking the nested-dict decision tree.

    myTree: tree produced by createTree (nested dicts, leaves are class labels).
    featLab: feature names matching the positions of testVec.
    testVec: list of feature values to classify.
    Returns the predicted class label, or None when the feature value at the
    current node has no branch in the tree.
    """
    # BUG FIX: in Python 3 dict.keys() returns a view, which is not
    # subscriptable; next(iter(...)) fetches the root feature name.
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    featIdx = featLab.index(firstStr)  # translate feature name to testVec index
    classLabel = None  # BUG FIX: avoids UnboundLocalError when no branch matches
    for key in secondDict:
        if testVec[featIdx] == key:
            if isinstance(secondDict[key], dict):
                # Internal node: recurse into the subtree.
                classLabel = classify(secondDict[key], featLab, testVec)
            else:
                # Leaf: the stored value is the class label.
                classLabel = secondDict[key]
    return classLabel
#------------------------------ use the tree -----------------------------
def lensesTreePredict():
    """Train an ID3 tree on lenses.txt and classify one test vector.

    Expects a tab-separated file 'lenses.txt' with m rows of 5 columns
    (4 attributes + class label). Returns the predicted lens type
    (per the data set: 1=hard, 2=soft, 3=no lenses).
    """
    # BUG FIX: the original used fr.readline(), which returns ONE string and
    # would be iterated character by character; readlines() yields the rows.
    # A with-block also guarantees the file is closed.
    with open('lenses.txt') as fr:
        lensesData = [inst.strip().split('\t') for inst in fr.readlines()]
    lenseLabels = ['age', 'prescript', 'astigmatic', 'tearRate']  # 4 attributes
    lenseTree = createTree(lensesData, lenseLabels)
    # NOTE: the trained tree can be persisted/restored via storeTree()/grabTree().
    testVec = [1, 2, 1, 2]
    return classify(lenseTree, lenseLabels, testVec)
# 構造決策樹很耗時,可以用Python的pickle模塊存儲字典對象。
def storeTree(myTree, fname):
    """Serialize a decision tree (nested dict) to file fname with pickle."""
    import pickle
    # BUG FIX: pickle writes bytes, so the file must be opened in 'wb' (the
    # original used 'w'); the original also wrote `fw.close` without calling
    # it, leaking the handle -- a with-block fixes both.
    with open(fname, 'wb') as fw:
        pickle.dump(myTree, fw)
def grabTree(fname):
    """Load and return a pickled decision tree from file fname."""
    import pickle
    # BUG FIX: pickle data is binary, so the file must be opened in 'rb'
    # (the original used the default text mode); the with-block also
    # closes the handle, which the original never did.
    with open(fname, 'rb') as fr:
        return pickle.load(fr)
# 另外,構造決策樹的方法還有C4.5、CART算法等。