手動實現隨機森林並做數據實驗

獲取波士頓房價數據集

import numpy as np
from numpy import *
import random
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
boston = load_boston()
boston.data.shape
(506, 13)
boston.target.shape
(506,)

搭建隨機森林

建立隨機森林類

import warnings 
warnings.filterwarnings('ignore')
from joblib import Parallel, delayed
class myrf:
    # 存放樹的列表
    trees = []
    # 隨機種子
    random_state = 0
    # 樹的個數
    n_estimators = 10
    # 最大特徵數
    max_features = 10
    # 最大深度
    max_depth = 10
    # 切分新節點所需的最小閾值
    min_change = 0.001
    # 當前樹的數量
    cur_tree = 0
    # 最小分割
    min_samples_split = 0
    # 葉子內節點的最小數目
    min_samples_leaf = 0
    # 每次建樹時所用的樣本佔總樣本的比例
    sample_radio = 0.9
    # 每次建樹時所並行化處理器的個數
    n_jobs = 10
    # 計算y的方差
    # 本來是要除總樣本數的,考慮到對於所有的葉子來說,總樣本數都是一致的,所以不除應該也可以。
    def get_varience(self, dataSet):
        return np.var(dataSet[:,-1])*shape(dataSet)[0]
    
    # 計算y的均值
    def get_mean(self,dataSet):
        return np.mean(dataSet[:,-1])
    
    # 根據特徵邊界劃分樣本
    def SplitDataSet(self, dataSet,feature,value):
        dataSet = dataSet[dataSet[:,feature].argsort()]
        for i in range(shape(dataSet)[0]):
            if dataSet[i][feature] == value and dataSet[i+1][feature] != value:
                return dataSet[i+1:, :], dataSet[0:i+1, :]
    
    # 選取特徵邊界
    def select_best_feature(self, dataSet):
        #計算特徵的數目
        feature_num=dataSet.shape[1]-1
        features=np.random.choice(feature_num,self.max_features,replace=False)
        # 最好分數
        bestS=inf;
        # 最優特徵
        bestfeature=0;
        # 最優特徵的分割值
        bestValue=0;
        S=self.get_varience(dataSet)
        # 判斷樣本數量是否足夠
        if shape(dataSet)[0] < self.min_samples_split or shape(dataSet)[0] < self.min_samples_leaf:
            return None,self.get_mean(dataSet)
        for feature in features:
            dataSet = dataSet[dataSet[:,feature].argsort()]
            # 控制葉子節點數目
            for index in range(shape(dataSet)[0]-1):
                # 排除重複值
                if index != shape(dataSet)[0]-1 and dataSet[index][feature] == dataSet[index+1][feature]:
                    continue
                data0 = dataSet[0:index+1, :]
                data1 = dataSet[index+1:, :]
                if shape(data0)[0] < self.min_samples_leaf or shape(data1)[0] < self.min_samples_leaf:
                    continue;
                newS=self.get_varience(data0)+self.get_varience(data1)
                if bestS>newS:
                    bestfeature=feature
                    bestValue=dataSet[index][feature]
#                     print(bestfeature, bestValue)
                    bestS=newS
        if (S-bestS)<self.min_change: #如果誤差不大就退出,說明無法分割
            return None,self.get_mean(dataSet)
#         print(bestfeature, bestValue)
        return bestfeature,bestValue
    
    # 搭建單顆決策樹
    def createTree(self, dataSet, max_level, flag = 0):
        if flag == 0:
            seqtree = self.cur_tree+1
            self.cur_tree = seqtree;
            print('正在搭建第',seqtree,'棵樹...')
        bestfeature,bestValue=self.select_best_feature(dataSet)
        if bestfeature==None:
            if flag == 0:
                print('第',seqtree,'棵樹搭建完成!')
            return bestValue
        retTree={}
        max_level-=1
        if max_level<0:   #控制深度
            return self.get_mean(dataSet)
        retTree['bestFeature']=bestfeature
        retTree['bestVal']=bestValue
        # 分割成左右兩棵樹
        lSet,rSet=self.SplitDataSet(dataSet,bestfeature,bestValue)
        retTree['right']=self.createTree(rSet,self.max_depth,1)
        retTree['left']=self.createTree(lSet,self.max_depth,1)
        if flag == 0:
            print('第',seqtree,'棵樹搭建完成!')
        return retTree
    # 搭建決策樹
    def createTree(self, dataSet, max_level, flag = 0):
        if flag == 0:
            seqtree = self.cur_tree+1
            self.cur_tree = seqtree;
            print('正在搭建第'+str(seqtree)+'棵樹...\n')
        bestfeature,bestValue=self.select_best_feature(dataSet)
        if bestfeature==None:
            if flag == 0:
                print('第'+str(seqtree)+'棵樹搭建完成!')
            return bestValue
        retTree={}
        max_level-=1
        if max_level<0:   #控制深度
            return self.get_mean(dataSet)
        retTree['bestFeature']=bestfeature
        retTree['bestVal']=bestValue
        # 分割成左右兩棵樹
        lSet,rSet=self.SplitDataSet(dataSet,bestfeature,bestValue)
        retTree['right']=self.createTree(rSet,self.max_depth,1)
        retTree['left']=self.createTree(lSet,self.max_depth,1)
        if flag == 0:
            print('第'+str(seqtree)+'棵樹搭建完成!')
        return retTree
    
    # 初始化隨機森林
    def __init__(self, random_state, n_estimators, max_features, max_depth, min_change = 0.001,
                 min_samples_split = 0, min_samples_leaf = 0, sample_radio = 0.9, n_jobs = 10):
        self.trees = []
        self.random_state = random_state
        np.random.seed(self.random_state)
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_change = min_change
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.sample_radio = sample_radio
        self.n_jobs = n_jobs
        
    # 向森林添加單棵決策樹
    def get_one_tree(self, dataSet):
        X_train, X_test, y_train, y_test = train_test_split(dataSet[:,:-1], dataSet[:,-1], 
                                                            train_size = self.sample_radio, random_state = self.random_state)
        X_train=np.concatenate((X_train,y_train.reshape((-1,1))),axis=1)
        self.trees.append(self.createTree(X_train,self.max_depth))
    
    # 並行化搭建隨機森林
    def fit(self, X, Y):   #樹的個數,預測時使用的特徵的數目,樹的深度
        dataSet = np.concatenate((X, Y.reshape(-1,1)), axis = -1)
        Parallel(n_jobs=self.n_jobs, backend="threading")(delayed(self.get_one_tree)(dataSet) for _ in range(self.n_estimators))             
            
    #預測單個數據樣本
    def treeForecast(self,tree,data):
        if not isinstance(tree,dict):
            return float(tree)
        if data[tree['bestFeature']]>tree['bestVal']:
            if type(tree['left'])=='float':
                return tree['left']
            else:
                return self.treeForecast(tree['left'],data)
        else:
            if type(tree['right'])=='float':
                return tree['right']
            else:
                return self.treeForecast(tree['right'],data) 
            
    # 單決策樹預測結果
    def createForeCast(self,tree,dataSet):
        seqtree = self.cur_tree+1
        self.cur_tree = seqtree;
        print('第'+str(seqtree)+'棵樹正在預測...\n')
        l=len(dataSet)
        predict=np.mat(zeros((l,1)))
        for i in range(l):
            predict[i,0]=self.treeForecast(tree,dataSet[i,:])
        print('第'+str(seqtree)+'棵樹預測完成!')
        return predict
    
    # 更新預測值函數
    def unpdate_predict(self, predict, tree, X):
        predict+=self.createForeCast(tree,X)
    
    # 隨機森林預測結果
    def predict(self,X):
        self.cur_tree = 0;
        l=len(X)
        predict=np.mat(zeros((l,1)))
        Parallel(n_jobs=self.n_jobs, backend="threading")(delayed(self.unpdate_predict)(predict, tree, X) for tree in self.trees)
    #     對多棵樹預測的結果取平均
        predict/=self.n_estimators
        return predict
    
    # 獲取模型分數
    def get_score(self,target, X):
        return r2_score(target, self.predict(X))

模型預測與評估

預測模型


# rf2 = mycache(random_state=2, n_estimators=10, max_features=3, max_depth=10, min_change=0.001, min_samples_split=20, n_jobs=10)
rf1 = myrf(random_state=2, n_estimators=10, max_features=3, max_depth=10, min_change=0.001, min_samples_split=20, n_jobs=-1)
rf1.fit(boston.data, boston.target)
正在搭建第1棵樹...

正在搭建第2棵樹...
正在搭建第3棵樹...
正在搭建第4棵樹...

第4棵樹搭建完成!
正在搭建第5棵樹...

第2棵樹搭建完成!第3棵樹搭建完成!

正在搭建第6棵樹...

正在搭建第7棵樹...

第1棵樹搭建完成!
正在搭建第8棵樹...

第6棵樹搭建完成!
正在搭建第9棵樹...

第5棵樹搭建完成!第7棵樹搭建完成!

第8棵樹搭建完成!
正在搭建第10棵樹...

第9棵樹搭建完成!
第10棵樹搭建完成!
rf1.get_score(boston.target, boston.data)
第1棵樹正在預測...
第2棵樹正在預測...
第3棵樹正在預測...

第1棵樹預測完成!
第4棵樹正在預測...
第3棵樹預測完成!第5棵樹正在預測...
第6棵樹正在預測...
第6棵樹預測完成!第2棵樹預測完成!
第7棵樹正在預測...
第7棵樹預測完成!
第8棵樹正在預測...
第8棵樹預測完成!
第5棵樹預測完成!
第9棵樹正在預測...
第4棵樹預測完成!第9棵樹預測完成!
第10棵樹正在預測...
第10棵樹預測完成!

0.9302502640348399
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章