決策樹（分類樹）算法

原創

2020-06-13 04:34

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz

# ID3算法 以信息增益爲準劃分屬性

'''
	函數功能： 計算信息熵
	:param dataSet 原始數據集
	:return ent 信息熵的值
'''


def calEnt(dataSet):
	"""Return the Shannon entropy of the label column of *dataSet*.

	:param dataSet: DataFrame whose last column holds the class labels.
	:return: sum over all classes of ``-p * log2(p)``, where ``p`` is the
		share of rows carrying that class.
	"""
	labelColumn = dataSet.iloc[:, -1]
	rowCount = dataSet.shape[0]

	# Occurrences of each class divided by the row count -> class probabilities.
	probs = labelColumn.value_counts() / rowCount

	# One entropy term per class; their sum is the entropy of the data set.
	terms = -probs * np.log2(probs)
	return terms.sum()


'''
	構建數據集
'''


def createDataSet():
	"""Build the five-row toy data set used by the demo.

	:return: DataFrame with the feature columns 'no surfacing' and
		'flippers' followed by the label column 'fish'.
	"""
	columnNames = ('no surfacing', 'flippers', 'fish')
	# One tuple per sample, in column order.
	samples = [
		(1, 1, 'yes'),
		(1, 1, 'yes'),
		(1, 0, 'no'),
		(0, 1, 'no'),
		(0, 1, 'no'),
	]
	# Transpose the samples into one list per column and key them by name.
	perColumn = dict(zip(columnNames, map(list, zip(*samples))))
	return pd.DataFrame(perColumn)


'''
	函數功能： 根據信息增益選擇出最佳數據集切分的列
	:param dataSet 原始數據集
	:return axis 數據集的最佳切分列的索引
'''


# 選擇最優的列進行切分
def bestSplit(dataSet):
	"""Pick the feature column with the largest information gain.

	:param dataSet: DataFrame whose last column holds the class labels;
		every other column is treated as a candidate feature.
	:return: positional index of the best column, or -1 when no column
		achieves an information gain greater than zero.
	"""
	totalRows = dataSet.shape[0]
	parentEnt = calEnt(dataSet)  # entropy before any split
	chosen, topGain = -1, 0

	for idx in range(dataSet.shape[1] - 1):  # every column except the label
		column = dataSet.iloc[:, idx]
		# Entropy after splitting on this column: each child's entropy
		# weighted by the share of rows that fall into that child.
		weightedEnt = sum(
			(subset.shape[0] / totalRows) * calEnt(subset)
			for subset in (
				dataSet[column == val] for val in column.value_counts().index
			)
		)
		gain = parentEnt - weightedEnt
		if gain > topGain:
			chosen, topGain = idx, gain

	return chosen


'''
	按照給定列切分數據集
	:params dataSet 原始數據集
			axis 指定列索引
			value: 指定屬性值
	:return redataSet   按照指定列索引和屬性值切分後的數據集
'''


def mySplit(dataSet, axis, value):
	"""Return the rows where column *axis* equals *value*, without that column.

	:param dataSet: source DataFrame.
	:param axis: positional index of the column to split on.
	:param value: attribute value that selects the rows to keep.
	:return: the matching rows with the split column removed, so an
		already-used feature is not offered again further down the tree.
	"""
	usedFeature = dataSet.columns[axis]
	matching = dataSet[dataSet[usedFeature] == value]
	return matching.drop(columns=usedFeature)


'''
	函數功能 遞歸構建決策樹
	:param dataSet
'''


def createTree(dataSet):
	"""Recursively build an ID3 decision tree stored as nested dicts.

	:param dataSet: DataFrame whose last column holds the class labels;
		every other column is a candidate feature.
	:return: a class label for a leaf, otherwise
		``{feature_name: {feature_value: subtree_or_label, ...}}``.
	:raises ValueError: if *dataSet* has no rows.
	"""
	if dataSet.shape[0] == 0:
		raise ValueError('createTree() requires a non-empty dataSet')

	# value_counts() orders classes by frequency (most frequent first), so
	# position 0 is the majority class.  Use .iloc for the count: plain [0]
	# is a *label* lookup and breaks when the class labels are integers.
	classCounts = dataSet.iloc[:, -1].value_counts()
	majorityLabel = classCounts.index[0]

	# Stop when every row has the same class or only the label column is left.
	if classCounts.iloc[0] == dataSet.shape[0] or dataSet.shape[1] == 1:
		return majorityLabel

	axis = bestSplit(dataSet)  # positional index of the best feature
	if axis == -1:
		# No feature yields any information gain; splitting further would
		# end up using the label column itself, so emit a majority leaf.
		return majorityLabel

	bestFeat = dataSet.columns[axis]
	myTree = {bestFeat: {}}

	# One branch per distinct value of the chosen feature.
	for value in set(dataSet.iloc[:, axis]):
		myTree[bestFeat][value] = createTree(mySplit(dataSet, axis, value))
	return myTree


'''
	函數功能 使用決策樹執行分類
	:params inputTree: 已經生成的決策樹
			labels: 存儲選擇的最優特徵標籤
			testVec: 測試數據列表，順序對應原數據集
	:return classLabel : 分類結果
'''
def classify(inputTree, labels, testVec):
	"""Classify one sample by walking a nested-dict decision tree.

	:param inputTree: tree of the form
		``{feature_name: {feature_value: subtree_or_label, ...}}``.
	:param labels: list of column names; the position of a feature name in
		this list is the position of its value in *testVec*.
	:param testVec: feature values of the sample, ordered like *labels*.
		May be a list/tuple or a pandas Series.
	:return: the class label stored at the leaf that is reached, or ``None``
		when the sample's value for a tested feature has no branch in the tree.
	"""
	firstStr = next(iter(inputTree))  # feature tested at this node
	secondDict = inputTree[firstStr]  # branches keyed by feature value
	featIndex = labels.index(firstStr)

	# Look the value up by position.  A Series with a label index needs
	# .iloc for that; plain sequences use ordinary indexing.
	if hasattr(testVec, 'iloc'):
		featValue = testVec.iloc[featIndex]
	else:
		featValue = testVec[featIndex]

	for key, branch in secondDict.items():
		if featValue == key:
			if isinstance(branch, dict):
				# Inner node: keep descending.
				return classify(branch, labels, testVec)
			return branch  # leaf: the stored class label

	# Value not covered by any branch of this node.
	return None


'''
	函數功能：對測試集進行預測，並返回預測後的結果
	:params train: 訓練集
			test: 測試集
	:return test: 預測好分類的測試集
'''
def acc_classify(train, test):
	"""Train a tree on *train*, predict every row of *test*, print the accuracy.

	:param train: training DataFrame; its last column holds the class labels.
	:param test: DataFrame to predict; its last column holds the true labels
		and the preceding columns are ordered like those of *train*.
	:return: a copy of *test* with the predictions appended in a 'predic'
		column.  The frame passed in is left unmodified.
	"""
	inputTree = createTree(train)
	labels = list(train.columns)
	labelCol = test.columns[-1]  # true labels, captured before 'predic' is added

	# Hand each row to classify() as a plain list of feature values so that
	# it can be indexed by position.
	featureRows = test.iloc[:, :-1].itertuples(index=False, name=None)
	result = [classify(inputTree, labels, list(row)) for row in featureRows]

	# Work on a copy: *test* may be a slice of another frame, and adding a
	# column to such a slice triggers pandas' SettingWithCopyWarning.
	scored = test.copy()
	scored['predic'] = result

	acc = (scored['predic'] == scored[labelCol]).mean()
	print(f'模型預測準確率爲{acc}')
	return scored

# Demo: train on the whole toy data set and score its first three rows.
dataSet = createDataSet()
train = dataSet
# Copy the slice so the test frame is independent of dataSet; assigning new
# columns to a bare iloc slice triggers pandas' SettingWithCopyWarning.
test = dataSet.iloc[:3, :].copy()
acc_classify(train, test)



'''
dataSet = createDataSet()

# 特徵
Xtrain = dataSet.iloc[:,:-1]
# 標籤
Ytrain = dataSet.iloc[:,-1]
labels = Ytrain.unique().tolist()
# 將文本轉化爲數字
Ytrain = Ytrain.apply(lambda  x : labels.index(x))

# 繪製樹模型
clf = DecisionTreeClassifier()
clf = clf.fit(Xtrain, Ytrain)
tree.export_graphviz(clf)
dot_data = tree.export_graphviz(clf, out_file=None)
graphviz.Source(dot_data)

# 繪製圖形增加標籤和顏色
dot_data = tree.export_graphviz(
	clf, out_file=None,
	feature_names=['no surfacing','flippers'],
	class_names=['fish','not fish'],
	filled=True,
	rounded=True,
	special_characters=True
)

graphviz.Source(dot_data)

# 利用render方法生成圖形
graph = graphviz.Source(dot_data)
graph.render("fish")
'''

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

決策樹（分類樹）算法

【簡寫Mybatis-02】註冊機的實現以及SqlSession處理

手繪二維碼

.NET藉助虛擬網卡實現一個簡單異地組網工具

nginx搭配Spring Cloud Gateway 搭建域名訪問環境

決策樹（分類樹）算法

基於 redis 的分佈式鎖

利用Docker部署Springboot項目 + vue+nginx+docker 的前端項目部署方案

樂鑫科技 2020 屆秋招-軟件類真題

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結