Case 1: Estimating House Prices

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, explained_variance_score   # import the evaluation metrics
from sklearn.utils import shuffle   # for shuffling the data
import matplotlib.pyplot as plt

# Load the data
house_data = datasets.load_boston()
# print(house_data.feature_names)
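
Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. On newer versions, one drop-in sketch is to fetch the same dataset from OpenML (this assumes the OpenML copy named "boston" matches the bundled one):

from sklearn.datasets import fetch_openml

# Fetch the Boston housing data from OpenML instead of the removed loader.
house_data = fetch_openml(name="boston", version=1, as_frame=False)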

# Shuffle the data; random_state fixes the shuffle order for reproducibility
x, y = shuffle(house_data.data, house_data.target, random_state=7)

# Split into training and test sets (80/20)
num_training = int(0.8*len(x))
x_train, y_train = x[:num_training], y[:num_training]
x_test, y_test = x[num_training:], y[num_training:]
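
Because the data has already been shuffled, the manual 80/20 slice above is fine; an equivalent split can also be done in one step with scikit-learn's train_test_split (a minimal sketch using the same variables):

from sklearn.model_selection import train_test_split

# shuffle=True is the default, so the separate shuffle step becomes unnecessary.
x_train, x_test, y_train, y_test = train_test_split(
    house_data.data, house_data.target, test_size=0.2, random_state=7)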

# Decision tree regression model
dt_regression = DecisionTreeRegressor(max_depth=4)
dt_regression.fit(x_train, y_train)

# AdaBoost regression model with decision tree base learners
ab_regression = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=400, random_state=7)
ab_regression.fit(x_train, y_train)

# Decision tree performance
y_pre_dt = dt_regression.predict(x_test)
mse = mean_squared_error(y_test, y_pre_dt)     # mean squared error: the average of the squared differences between predictions and true values (no square root is taken)
evs = explained_variance_score(y_test, y_pre_dt)  # explained variance score
print("\n###Decision tree performance###")
print("Mean squared error = ", round(mse, 2))
print("Explain variance score = ", round(evs, 2))

# AdaBoost performance
y_pre_ab = ab_regression.predict(x_test)
mse = mean_squared_error(y_test, y_pre_ab)
evs = explained_variance_score(y_test, y_pre_ab)
print("##AbBoost tree performance##")
print("Mean squared error = ", round(mse, 2))  
print("Explain variance score = ", round(evs, 2))


def plot_feature_importances(feature_importances, title, feature_names):
    '''
    Plot the relative importance of each feature.
    :param feature_importances: importance scores from a fitted model
    :param title: chart title
    :param feature_names: names of the features
    :return: None
    '''
    # Normalize the importance values so the largest is 100
    feature_importances = 100.0 * (feature_importances / max(feature_importances))
    
    # Indices that sort the scores from high to low
    index_sorted = np.flipud(np.argsort(feature_importances))
    
    # Center the labels on the x-axis
    pos = np.arange(index_sorted.shape[0])+0.5

    # Draw the bar chart
    plt.figure()
    plt.bar(pos, feature_importances[index_sorted], align='center')
    plt.xticks(pos, feature_names[index_sorted])
    plt.ylabel('Relative Importance')
    plt.title(title)
    plt.show()


plot_feature_importances(dt_regression.feature_importances_, 'Decision Tree Regressor', house_data.feature_names)

plot_feature_importances(ab_regression.feature_importances_, 'AdaBoost Regressor', house_data.feature_names)

Output:

###Decision tree performance###
Mean squared error =  14.79
Explained variance score =  0.82
###AdaBoost performance###
Mean squared error =  7.64
Explained variance score =  0.91

Figures: bar charts of the relative feature importances for the decision tree regressor and for the AdaBoost regressor.
Explanation: comparing AdaBoost against a single decision tree shows that AdaBoost performs better (lower mean squared error, higher explained variance). The most important feature for the decision tree regressor is RM (average number of rooms per dwelling), while for AdaBoost it is LSTAT (percentage of lower-status population).

Topics covered:

(1) The AdaBoost algorithm: boosting fits a sequence of weak learners, re-weighting the training samples after each round so that later learners concentrate on the examples the current ensemble predicts poorly; the final prediction combines all the learners.
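
One way to see the effect of the boosting rounds is scikit-learn's staged_predict, which yields the ensemble's prediction after each round (a small sketch reusing ab_regression, x_test and y_test from the script above):

# Track the test MSE as weak learners are added one by one.
staged_mse = [mean_squared_error(y_test, y_staged)
              for y_staged in ab_regression.staged_predict(x_test)]
print("MSE after 1 estimator:   ", round(staged_mse[0], 2))
print("MSE after all estimators:", round(staged_mse[-1], 2))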

(2) Metrics: mean_squared_error and explained_variance_score come from sklearn.metrics.
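
Both scores can be reproduced by hand with numpy, which makes their definitions concrete (a minimal sketch with made-up values; mse_manual and evs_manual are illustrative names):

import numpy as np

y_true = np.array([3.0, 2.5, 4.0, 5.5])
y_pred = np.array([2.8, 3.0, 4.2, 5.0])

# Mean squared error: average of the squared residuals.
mse_manual = np.mean((y_true - y_pred) ** 2)

# Explained variance score: 1 - Var(residuals) / Var(y_true); 1.0 means a perfect fit.
evs_manual = 1 - np.var(y_true - y_pred) / np.var(y_true)

print(round(mse_manual, 2), round(evs_manual, 2))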

(3) round(number, ndigits) keeps the given number of digits after the decimal point.
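
For example (note the usual binary floating-point caveat):

print(round(3.14159, 2))  # 3.14
print(round(2.675, 2))    # 2.67, not 2.68 -- 2.675 is stored as slightly less than 2.675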

(4) The feature_importances_ attribute gives each feature's share of the model's total importance (the values sum to 1).

print(dt_regression.feature_importances_)

which gives:

[ 0.03856422  0.          0.          0.          0.02908806  0.62280298
  0.          0.10473383  0.          0.          0.00460542  0.
  0.20020548]
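
To pair each score with its feature name (a small sketch reusing dt_regression and house_data from above):

for name, score in zip(house_data.feature_names, dt_regression.feature_importances_):
    print(name, round(score, 4))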


The line below rescales the importances so the largest value becomes 100 and every other feature is expressed relative to it:

feature_importances = 100.0 * (feature_importances / max(feature_importances))
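
A worked example with made-up numbers (illustrative only):

import numpy as np

importances = np.array([0.62, 0.20, 0.10, 0.04])
scaled = 100.0 * (importances / max(importances))
print(scaled)  # approximately [100.  32.26  16.13  6.45]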

(5)

index_sorted = np.flipud(np.argsort(feature_importances))

flipud() flips an array along its first axis (upside down for a matrix; for a 1-D array it simply reverses the order):

import numpy as np

a = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]
m = np.array(a)
# Rotate 90 degrees counterclockwise; k is the number of rotations
mt = np.rot90(m, k=1)
print(mt)   # [[3 6 9] [2 5 8] [1 4 7]]
# Flip left-right
mr = np.fliplr(m)
print(mr)   # [[3 2 1] [6 5 4] [9 8 7]]
# Flip up-down
md = np.flipud(m)
print(md)   # [[7 8 9] [4 5 6] [1 2 3]]
argsort() returns the indices that would sort the array in ascending order (it does not sort the array itself), so flipping its result gives the indices from largest to smallest.
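
A quick check of the argsort-plus-flipud combination used in plot_feature_importances (illustrative values):

import numpy as np

scores = np.array([10., 40., 20., 30.])
asc = np.argsort(scores)   # [0 2 3 1] -- indices of values in ascending order
desc = np.flipud(asc)      # [1 3 2 0] -- indices of values in descending order
print(scores[desc])        # [40. 30. 20. 10.]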
