獲取波士頓房價數據集
import numpy as np
from numpy import *
import random
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
# Load the Boston housing dataset; `boston.data` holds the features and
# `boston.target` the regression target used further down.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still provides it.
boston = load_boston()
# Shape check of the feature matrix; the tuple on the next line appears to be
# the echoed notebook output of this expression.
boston.data.shape
(506, 13)
# Shape check of the target vector, again followed by its echoed output.
boston.target.shape
(506,)
搭建隨機森林
建立隨機森林類
import warnings
warnings.filterwarnings('ignore')
from joblib import Parallel, delayed
class myrf:
    """Random-forest regressor built from variance-reduction regression trees.

    Every tree is a nested ``dict`` with the keys ``'bestFeature'`` (column
    index), ``'bestVal'`` (threshold), ``'left'`` (subtree for rows whose
    feature value is *greater* than the threshold) and ``'right'`` (subtree
    for rows whose value is less than or equal to it).  A leaf is a plain
    number: the mean target of the rows that reached it.

    Internally a data set is a 2-D array whose last column is the target.

    Feature sub-sampling draws from NumPy's global RNG, which ``__init__``
    seeds with ``random_state``.  Trees are built on a thread pool, so the
    order in which trees consume that stream depends on thread scheduling.
    """

    # Class-level defaults; ``__init__`` overrides all of them per instance.
    # ``trees`` is intentionally NOT a class attribute: a mutable list here
    # would be shared by every instance.
    random_state = 0        # seed for feature sampling and row sub-sampling
    n_estimators = 10       # number of trees in the forest
    max_features = 10       # features examined at each split
    max_depth = 10          # maximum number of split levels per tree
    min_change = 0.001      # minimum error reduction required to split
    cur_tree = 0            # running counter used only for progress messages
    min_samples_split = 0   # minimum rows a node needs to be split
    min_samples_leaf = 0    # minimum rows each child must keep
    sample_radio = 0.9      # fraction of rows drawn for each tree
    n_jobs = 10             # number of worker threads

    def __init__(self, random_state, n_estimators, max_features, max_depth, min_change=0.001,
                 min_samples_split=0, min_samples_leaf=0, sample_radio=0.9, n_jobs=10):
        """Store the hyper-parameters and seed NumPy's global RNG."""
        self.trees = []
        self.cur_tree = 0
        self.random_state = random_state
        np.random.seed(self.random_state)
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_change = min_change
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.sample_radio = sample_radio
        self.n_jobs = n_jobs

    def get_varience(self, dataSet):
        """Return the target's sum of squared deviations (variance * row count).

        The un-normalised form is used so that the errors of two children can
        simply be added and compared with the error of their parent.
        """
        return np.var(dataSet[:, -1]) * dataSet.shape[0]

    def get_mean(self, dataSet):
        """Return the mean of the target column (the value stored in a leaf)."""
        return np.mean(dataSet[:, -1])

    def SplitDataSet(self, dataSet, feature, value):
        """Split the rows on ``feature`` at threshold ``value``.

        Returns:
            ``(greater, not_greater)``: rows with ``feature > value`` and rows
            with ``feature <= value``, both sorted by ``feature``.  Either
            part may be empty; ``value`` need not occur in the data.
        """
        ordered = dataSet[dataSet[:, feature].argsort()]
        # First position whose feature value is strictly greater than `value`.
        cut = np.searchsorted(ordered[:, feature], value, side="right")
        return ordered[cut:, :], ordered[:cut, :]

    def select_best_feature(self, dataSet):
        """Pick the (feature, threshold) pair that minimises the children's error.

        Returns:
            ``(feature_index, threshold)`` for a worthwhile split, or
            ``(None, leaf_value)`` when the node must become a leaf (too few
            rows, no admissible split, or a gain below ``min_change``).
        """
        n_samples = dataSet.shape[0]
        feature_num = dataSet.shape[1] - 1
        # Never ask for more distinct features than exist; the unclamped call
        # raises ValueError with replace=False.
        n_candidates = min(self.max_features, feature_num)
        if (n_samples < 2 or n_candidates <= 0
                or n_samples < self.min_samples_split
                or n_samples < self.min_samples_leaf):
            return None, self.get_mean(dataSet)

        features = np.random.choice(feature_num, n_candidates, replace=False)
        bestS = np.inf
        bestfeature = 0
        bestValue = 0
        S = self.get_varience(dataSet)

        for feature in features:
            ordered = dataSet[dataSet[:, feature].argsort()]
            column = ordered[:, feature]
            # Only boundaries between two different values are real split
            # points; positions inside a run of equal values are skipped.
            for index in np.flatnonzero(column[:-1] != column[1:]):
                left_size = index + 1
                if (left_size < self.min_samples_leaf
                        or n_samples - left_size < self.min_samples_leaf):
                    continue
                newS = (self.get_varience(ordered[:left_size])
                        + self.get_varience(ordered[left_size:]))
                if newS < bestS:
                    bestfeature = feature
                    bestValue = column[index]
                    bestS = newS

        # No admissible split at all, or the improvement is too small.
        if not np.isfinite(bestS) or (S - bestS) < self.min_change:
            return None, self.get_mean(dataSet)
        return bestfeature, bestValue

    def _grow(self, dataSet, max_level):
        """Recursively build a subtree that may split at most ``max_level`` times."""
        if max_level <= 0:  # depth budget exhausted
            return self.get_mean(dataSet)
        bestfeature, bestValue = self.select_best_feature(dataSet)
        if bestfeature is None:
            return bestValue
        lSet, rSet = self.SplitDataSet(dataSet, bestfeature, bestValue)
        # The remaining budget (not the forest-wide max_depth) is handed down
        # so the depth limit is actually enforced.
        return {
            'bestFeature': bestfeature,
            'bestVal': bestValue,
            'right': self._grow(rSet, max_level - 1),
            'left': self._grow(lSet, max_level - 1),
        }

    def createTree(self, dataSet, max_level, flag=0):
        """Build one regression tree.

        Args:
            dataSet: 2-D array, target in the last column.
            max_level: maximum number of split levels below this node.
            flag: ``0`` for a top-level call (progress messages are printed
                and ``cur_tree`` is advanced); any other value builds the
                subtree silently.

        Returns:
            A nested ``dict`` as described on the class, or a number when the
            node is a leaf.
        """
        if flag != 0:
            return self._grow(dataSet, max_level)
        seqtree = self.cur_tree + 1
        self.cur_tree = seqtree
        print('正在搭建第'+str(seqtree)+'棵樹...\n')
        tree = self._grow(dataSet, max_level)
        print('第'+str(seqtree)+'棵樹搭建完成!')
        return tree

    def get_one_tree(self, dataSet, seed=None):
        """Train one tree on a random row subset and append it to the forest.

        Args:
            dataSet: 2-D array, target in the last column.
            seed: seed for the row sub-sampling; defaults to
                ``self.random_state``.  ``fit`` passes a different seed per
                tree so the trees do not all see the same rows.
        """
        if seed is None:
            seed = self.random_state
        X_train, _, y_train, _ = train_test_split(
            dataSet[:, :-1], dataSet[:, -1],
            train_size=self.sample_radio, random_state=seed)
        sample = np.concatenate((X_train, y_train.reshape((-1, 1))), axis=1)
        self.trees.append(self.createTree(sample, self.max_depth))

    def fit(self, X, Y):
        """Build ``n_estimators`` trees on a thread pool.

        Any previously fitted trees are discarded, so fitting twice does not
        accumulate trees.
        """
        dataSet = np.concatenate((np.asarray(X), np.asarray(Y).reshape(-1, 1)), axis=-1)
        self.trees = []
        self.cur_tree = 0
        base = self.random_state
        seeds = [None if base is None else base + i for i in range(self.n_estimators)]
        Parallel(n_jobs=self.n_jobs, backend="threading")(
            delayed(self.get_one_tree)(dataSet, seed) for seed in seeds)

    def treeForecast(self, tree, data):
        """Return the prediction of ``tree`` for the single sample ``data``."""
        node = tree
        while isinstance(node, dict):
            if data[node['bestFeature']] > node['bestVal']:
                node = node['left']
            else:
                node = node['right']
        return float(node)

    def createForeCast(self, tree, dataSet):
        """Predict every row of ``dataSet`` with one tree.

        Returns:
            An ``(n_rows, 1)`` ``np.matrix`` of predictions.
        """
        seqtree = self.cur_tree + 1
        self.cur_tree = seqtree
        print('第'+str(seqtree)+'棵樹正在預測...\n')
        values = [self.treeForecast(tree, dataSet[i, :]) for i in range(len(dataSet))]
        # np.asmatrix instead of np.mat: the latter was removed in NumPy 2.0.
        predict = np.asmatrix(np.array(values, dtype=float).reshape(-1, 1))
        print('第'+str(seqtree)+'棵樹預測完成!')
        return predict

    def unpdate_predict(self, predict, tree, X):
        """Add the predictions of ``tree`` for ``X`` to ``predict`` in place."""
        predict += self.createForeCast(tree, X)

    def predict(self, X):
        """Return the forest prediction for ``X``, averaged over the fitted trees.

        Returns:
            An ``(n_rows, 1)`` ``np.matrix``.

        Raises:
            ValueError: if the forest holds no trees (``fit`` was not called).
        """
        if not self.trees:
            raise ValueError("predict() called before fit(): the forest has no trees")
        self.cur_tree = 0
        # Each worker returns its own result; summing afterwards avoids
        # concurrent in-place updates of one shared accumulator.
        per_tree = Parallel(n_jobs=self.n_jobs, backend="threading")(
            delayed(self.createForeCast)(tree, X) for tree in self.trees)
        predict = np.asmatrix(np.zeros((len(X), 1)))
        for single in per_tree:
            predict += single
        # Average over the trees actually present in the forest.
        predict /= len(self.trees)
        return predict

    def get_score(self, target, X):
        """Return the R^2 score of the forest's predictions for ``X`` against ``target``."""
        # Flatten to a plain 1-D array; recent scikit-learn versions reject
        # np.matrix input.
        return r2_score(target, np.asarray(self.predict(X)).ravel())
模型預測與評估
預測模型
# Earlier experiment, kept for reference:
# rf2 = mycache(random_state=2, n_estimators=10, max_features=3, max_depth=10, min_change=0.001, min_samples_split=20, n_jobs=10)

# Forest of 10 trees, 3 candidate features per split, at most 10 levels deep.
rf1 = myrf(
    random_state=2,
    n_estimators=10,
    max_features=3,
    max_depth=10,
    min_change=0.001,
    min_samples_split=20,
    n_jobs=-1,
)
# Train on the complete Boston data set.
rf1.fit(boston.data, boston.target)
正在搭建第1棵樹...
正在搭建第2棵樹...
正在搭建第3棵樹...
正在搭建第4棵樹...
第4棵樹搭建完成!
正在搭建第5棵樹...
第2棵樹搭建完成!第3棵樹搭建完成!
正在搭建第6棵樹...
正在搭建第7棵樹...
第1棵樹搭建完成!
正在搭建第8棵樹...
第6棵樹搭建完成!
正在搭建第9棵樹...
第5棵樹搭建完成!第7棵樹搭建完成!
第8棵樹搭建完成!
正在搭建第10棵樹...
第9棵樹搭建完成!
第10棵樹搭建完成!
# R^2 of the fitted forest. The data scored here is the same data passed to
# fit() above, so this is a training-set score, not a held-out estimate.
rf1.get_score(boston.target, boston.data)
第1棵樹正在預測...
第2棵樹正在預測...
第3棵樹正在預測...
第1棵樹預測完成!
第4棵樹正在預測...
第3棵樹預測完成!第5棵樹正在預測...
第6棵樹正在預測...
第6棵樹預測完成!第2棵樹預測完成!
第7棵樹正在預測...
第7棵樹預測完成!
第8棵樹正在預測...
第8棵樹預測完成!
第5棵樹預測完成!
第9棵樹正在預測...
第4棵樹預測完成!第9棵樹預測完成!
第10棵樹正在預測...
第10棵樹預測完成!
0.9302502640348399