import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, explained_variance_score #導入評估方法
from sklearn.utils import shuffle #打亂順序
import matplotlib.pyplot as plt
# Load the Boston housing dataset.
# NOTE: sklearn.datasets.load_boston was removed in scikit-learn 1.2; on newer
# versions rebuild an equivalent object from the original CMU source, which is
# the workaround recommended by scikit-learn's removal notice.
try:
    house_data = datasets.load_boston()
except (AttributeError, ImportError):
    from types import SimpleNamespace

    import pandas as pd

    _raw = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+", skiprows=22, header=None,
    )
    house_data = SimpleNamespace(
        # Each sample spans two physical rows in the raw file: 11 values on
        # the first row, then B, LSTAT, MEDV (the target) on the second.
        data=np.hstack([_raw.values[::2, :], _raw.values[1::2, :2]]),
        target=_raw.values[1::2, 2],
        feature_names=np.array(
            ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
             "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
        ),
    )
# print(house_data.feature_names)
# Shuffle the samples; random_state pins the permutation for reproducibility.
x, y = shuffle(house_data.data, house_data.target, random_state=7)
# 80/20 train/test split.
num_training = int(0.8 * len(x))
x_train, y_train = x[:num_training], y[:num_training]
x_test, y_test = x[num_training:], y[num_training:]
# Baseline: a single depth-limited decision-tree regressor.
dt_regression = DecisionTreeRegressor(max_depth=4).fit(x_train, y_train)
# AdaBoost ensemble of 400 depth-4 trees; random_state pins the boosting runs.
ab_regression = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4), n_estimators=400, random_state=7
).fit(x_train, y_train)
# Score both models on the held-out set: mean squared error (lower is better)
# and explained-variance score (closer to 1 is better).
for header, model in (
    ("\n###Decision tree performance###", dt_regression),
    ("##AbBoost tree performance##", ab_regression),
):
    prediction = model.predict(x_test)
    print(header)
    print("Mean squared error = ", round(mean_squared_error(y_test, prediction), 2))
    print("Explain variance score = ", round(explained_variance_score(y_test, prediction), 2))
def plot_feature_importances(feature_importances, title, feature_names):
    """Draw a bar chart of feature importances, highest first.

    Importances are rescaled so the most important feature reads 100.

    :param feature_importances: 1-D array of raw importance scores
    :param title: title string for the chart
    :param feature_names: array of feature labels, indexable by an ndarray
    :return: None (shows the figure)
    """
    # Normalize so the largest importance maps to 100.
    feature_importances = 100.0 * (feature_importances / max(feature_importances))
    # Indices sorted from highest to lowest importance.
    index_sorted = np.flipud(np.argsort(feature_importances))
    # Shift tick positions by 0.5 so labels sit centered under the bars.
    pos = np.arange(index_sorted.shape[0]) + 0.5
    # Draw the bar chart.
    plt.figure()
    plt.bar(pos, feature_importances[index_sorted], align='center')
    plt.xticks(pos, feature_names[index_sorted])
    plt.ylabel('Relative Importance')
    plt.title(title)
    plt.show()
# Compare which features each model relies on ("Decission" typo fixed in title).
plot_feature_importances(dt_regression.feature_importances_, 'Decision Tree Regressor', house_data.feature_names)
plot_feature_importances(ab_regression.feature_importances_, 'AdaBoost regression', house_data.feature_names)
結果:
###Decision tree performance###
Mean squared error = 14.79
Explain variance score = 0.82
##AbBoost tree performance##
Mean squared error = 7.64
Explain variance score = 0.91
圖形:
解釋:採用AdaBoost算法和決策樹做對比,可以看出用AdaBoost結果更好。迴歸決策樹的最重要特徵是RM,而AdaBoost最重要特徵是LSTAT
涉及內容:
(1)AdaBoost算法:
(2)metric
(3)round(原數字,位數)函數作用是保留原數字小數點後幾位
(4)feature_importances_屬性是得出每個特徵的佔比
print(dt_regression.feature_importances_)
得到:[ 0.03856422  0.          0.          0.          0.02908806  0.62280298
  0.          0.10473383  0.          0.          0.00460542  0.
  0.20020548]
下面代碼:目的是讓每個特徵在100範圍內展示
feature_importances = 100.0* (feature_importances / max(feature_importances))
(5)
index_sorted = np.flipud(np.argsort(feature_importances))
flipud():作用是讓矩陣上下翻轉
import numpy as np
a = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
m = np.array(a)
# 翻轉90度,k是翻轉的次數
mt = np.rot90(m, k=1)
print(mt)
# 左右翻轉
mr = np.fliplr(m)
print(mr)
# 上下翻轉
md = np.flipud(m)
print(md)

argsort():作用是從小到大排列