ID3構造決策樹預測隱形眼鏡類型(代碼筆記)

決策樹可以從數據集合中提取出一系列規則,從而分類樣本。它的優勢是理解數據蘊含信息。

思想:利用信息增益(information gain)【度量數據集信息的方式—香農熵(entropy)】計算得出最好的劃分數據集的特徵,用遞歸的方式不斷找下一個最好的劃分特徵,直到遍歷完所有的屬性(特徵),或剩下實例的類別都相同時。返回的是用嵌套字典表示的樹結構(嵌套就是遞歸實現的)。

侷限:ID3無法直接處理數值型數據,一般應用對象是標稱型數據。

import operator
import numpy as np
'''
ID3決策樹預測隱形眼鏡類型
'''
#----------------------------- build the tree --------------------------
# 多數表決法, 對統計個數排序
# Majority vote: pick the class label that occurs most often.
def majorityCnt(classList):
    """Return the most frequent label in classList.

    Ties are resolved in favor of the label seen first (same as the
    original stable reverse-sort behavior).
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first maximal item in insertion order,
    # matching a stable descending sort's first element.
    return max(tally.items(), key=lambda kv: kv[1])[0]

# 計算香農熵
# Compute the Shannon entropy of a dataset.
def calcShannonEnt(data):
    """Return the Shannon entropy (in bits) of the class labels in data.

    Args:
        data: list of examples; each example is a sequence whose LAST
            element is the class label.

    Returns:
        float: base-2 entropy; 0.0 for an empty dataset.
    """
    numExm = len(data)
    if numExm == 0:
        return 0.0  # guard against division by zero on empty input
    labelCounts = {}
    for featVec in data:
        curLabel = featVec[-1]
        labelCounts[curLabel] = labelCounts.get(curLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / float(numExm)
        # BUG FIX: np.log(prob, 2) passes 2 as np.log's `out` argument
        # and raises TypeError; np.log2 gives the intended base-2 log.
        shannonEnt -= prob * np.log2(prob)
    return shannonEnt

# 拆出數據集中屬性值feature[feat]爲val的部分,返回剩餘集(去掉該屬性)
# Select the rows whose column `feat` equals `val`, dropping that column.
def splitDataset(data, feat, val):
    """Return the subset of data matching val at index feat.

    The matched feature column is removed from every returned row,
    so the result has one fewer column than the input.
    """
    return [row[:feat] + row[feat + 1:] for row in data if row[feat] == val]

# 找最好的劃分數據集的屬性, 作爲樹的結點
# Find the best attribute to split on, used as the next tree node.
def chooseBestFeatureToSplit(data):
    """Return the index of the feature with the highest information gain.

    Args:
        data: list of examples; the last column is the class label.

    Returns:
        int: best feature index, or -1 if no split yields positive gain.
    """
    baseEntropy = calcShannonEnt(data)  # entropy of the full dataset
    bestInfoGain = 0.0
    bestFeature = -1
    numFeat = len(data[0]) - 1  # number of attributes; last column is the label
    for i in range(numFeat):  # evaluate every attribute
        uniqueVals = set(exm[i] for exm in data)
        newEntropy = 0.0
        for val in uniqueVals:  # weighted entropy over each attribute value
            subData = splitDataset(data, i, val)
            # BUG FIX: the original read `subDataset`, an undefined name
            # (NameError at runtime); the split result is bound to `subData`.
            prob = len(subData) / float(len(data))
            newEntropy += prob * calcShannonEnt(subData)
        infoGain = baseEntropy - newEntropy  # information gain of attribute i
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

def createTree(data, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        data: examples whose last column is the class label.
        labels: attribute names aligned with the feature columns.
            NOTE: this list is mutated (the chosen attribute is deleted
            at each level); pass a copy if the caller needs it intact.

    Returns:
        A nested dict {attrName: {value: subtree-or-label}}, or a bare
        class label at the leaves.
    """
    targets = [row[-1] for row in data]
    # Base case 1: every remaining example already shares one class.
    if targets.count(targets[0]) == len(targets):
        return targets[0]
    # Base case 2: only the label column remains — majority vote.
    if len(data[0]) == 1:
        return majorityCnt(targets)
    best = chooseBestFeatureToSplit(data)
    bestName = labels[best]
    tree = {bestName: {}}
    del labels[best]  # this attribute is consumed at this level
    branchVals = set(row[best] for row in data)  # all observed values of the attribute
    for val in branchVals:
        # Each child gets its own copy of the remaining attribute names.
        tree[bestName][val] = createTree(splitDataset(data, best, val), labels[:])
    return tree

def classify(myTree, featLab, testVec):
    """Walk the decision tree to predict the class of one example.

    Args:
        myTree: nested-dict tree produced by createTree().
        featLab: attribute names aligned with testVec's positions.
        testVec: feature values of the example to classify.

    Returns:
        The predicted class label, or None if testVec holds an
        attribute value the tree has no branch for.
    """
    # BUG FIX: dict.keys() is a non-indexable view in Python 3;
    # next(iter(...)) fetches the single root attribute name.
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    featIdx = featLab.index(firstStr)  # map attribute name to vector index
    classLabel = None  # robustness: defined even when no branch matches
    for key in secondDict:
        if testVec[featIdx] == key:
            subtree = secondDict[key]
            if isinstance(subtree, dict):
                classLabel = classify(subtree, featLab, testVec)  # descend
            else:
                classLabel = subtree  # leaf: the class label itself
    return classLabel

#------------------------------ use the tree -----------------------------
def lensesTreePredict():
    """Train an ID3 tree on lenses.txt and classify one test example.

    Returns:
        The predicted lens type for testVec; per the dataset's encoding
        1 = hard, 2 = soft, 3 = no lenses.
    """
    # BUG FIX: the original iterated fr.readline() — the CHARACTERS of
    # the first line — instead of the file's lines; readlines() yields
    # one tab-separated example per line (m rows, 5 columns).
    with open('lenses.txt') as fr:
        lensesData = [inst.strip().split('\t') for inst in fr.readlines()]
    lenseLabels = ['age', 'prescript', 'astigmatic', 'tearRate']  # 4 attributes
    # BUG FIX: createTree mutates its labels argument (del labels[best]),
    # which would break featLab.index() inside classify(); train on a copy.
    lenseTree = createTree(lensesData, lenseLabels[:])
    # The tree can be persisted/restored with storeTree()/grabTree().
    testVec = [1, 2, 1, 2]
    return classify(lenseTree, lenseLabels, testVec)

構造決策樹很耗時,可以用Python的pickle模塊存儲字典對象。

def storeTree(myTree, fname):
    """Serialize a decision tree (nested dict) to fname with pickle."""
    import pickle
    # BUG FIX: pickle requires a binary-mode file ('wb', not 'w'), and
    # the original wrote `fw.close` without parentheses, so the handle
    # was never closed; the context manager closes it reliably.
    with open(fname, 'wb') as fw:
        pickle.dump(myTree, fw)

def grabTree(fname):
    """Deserialize a pickled decision tree from fname and return it."""
    import pickle
    # BUG FIX: pickle data is binary; text-mode open fails with a
    # decode/type error on load. Open with 'rb' and close via `with`.
    # NOTE(review): pickle.load on untrusted files can execute arbitrary
    # code — only load trees this program wrote itself.
    with open(fname, 'rb') as fr:
        return pickle.load(fr)

另外,構造決策樹的方法還有C4.5、CART算法等。

發佈了71 篇原創文章 · 獲贊 98 · 訪問量 42萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章