Contents:
1. Inspecting the data
2. A baseline XGBoost model
3. Parameter tuning
1. Inspecting the data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
train = pd.read_csv(r'F:\51學習\study\數據挖掘案例\Xgboost調參\Xgboost\train.csv')
test = pd.read_csv(r'F:\51學習\study\數據挖掘案例\Xgboost調參\Xgboost\test.csv')
print(train.shape)
print(test.shape)
train.head()
(188318, 132)
(125546, 131)
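The test set has one column fewer than the train set: the target. A quick sanity check (an illustrative line, not in the original notebook):
# The only column present in train but not in test should be the target 'loss'.
print(set(train.columns) - set(test.columns))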
1.1 Categorical features
cat_features = list(train.select_dtypes(include = 'object').columns)
print('Categorical: {} features.'.format(len(cat_features)))
cont_features = [cont for cont in list(train.select_dtypes(
include = ['float64', 'int64']).columns) if cont not in ['loss', 'id']]
print('Continuous: {} features.'.format(len(cont_features)))
id_col = list(train.select_dtypes(include = 'int64').columns)
print('A column of int64: {}.'.format(id_col))
# Number of distinct values in each categorical feature
categorical_uniques = []
for cat in cat_features:
    categorical_uniques.append(len(train[cat].unique()))
uniq_values_in_categories = pd.DataFrame.from_dict({'cat_name':cat_features, 'unique_values':categorical_uniques})
uniq_values_in_categories.head()
Categorical: 116 features.
Continuous: 14 features.
A column of int64: ['id'].
plt.style.use('seaborn-darkgrid')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 5))
ax1.hist(uniq_values_in_categories.unique_values, bins = 50)
ax1.set_title('Amount of categorical features with X distinct values')
ax1.set_xlabel('Distinct values in a feature')
ax1.set_ylabel('Features')
ax1.annotate('A feature with 326 vals', xy=(322, 2), xytext=(200, 38), arrowprops=dict(facecolor='black'))
ax2.hist(uniq_values_in_categories[uniq_values_in_categories.unique_values <= 30].unique_values, bins = 30)
ax2.set_xlim(2, 30)
ax2.set_title('Zooming in the [0,30] part of left histogram')
ax2.set_xlabel('Distinct values in a feature')
ax2.set_ylabel('Features')
ax2.annotate('Binary features', xy = (3, 71), xytext = (7, 71), arrowprops = dict(facecolor = 'black'))
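The annotated outlier with 326 distinct values can be located directly from the summary table built above (an illustrative line, not in the original notebook):
# Show the categorical features with the most distinct values.
print(uniq_values_in_categories.sort_values('unique_values', ascending = False).head(3))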
1.2 The target variable
plt.figure(figsize = (12, 6))
plt.plot(train['id'], train['loss'])
plt.title('Loss values per id')
plt.xlabel('Id')
plt.ylabel('loss')
The target distribution is skewed; np.log can be applied to reduce the skew:
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(14, 5)
ax1.hist(train['loss'], bins = 50)
ax1.set_title('Train Loss target histogram')
ax2.hist(np.log(train['loss']), bins = 50, color = 'g')
ax2.set_title('Train Log Loss target histogram')
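To quantify the improvement, the skewness can be computed before and after the transform; a minimal sketch assuming scipy is available (it is not imported above):
from scipy.stats import skew
# Values near 0 indicate a roughly symmetric distribution.
print('Raw loss skew:', skew(train['loss']))
print('Log loss skew:', skew(np.log(train['loss'])))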
1.3 Continuous features
train[cont_features].hist(bins = 50, figsize = (16, 12))
# Feature correlations
plt.figure(figsize = (16, 9))
correlation_mat = train[cont_features].corr()
sns.heatmap(correlation_mat, annot = True, cmap = 'summer_r')
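The largest pairwise correlations can also be read off programmatically; a hedged sketch using the correlation_mat computed above:
# Keep only the strict upper triangle so each pair appears once, then rank.
mask = np.triu(np.ones(correlation_mat.shape, dtype = bool), k = 1)
corr_pairs = correlation_mat.where(mask).stack().sort_values(ascending = False)
print(corr_pairs.head(5))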
1.4 Data preprocessing
train['log_loss'] = np.log(train['loss'])
features = [x for x in train.columns if x not in ['id', 'loss', 'log_loss']]
cat_features = list(train.select_dtypes(include = 'object').columns)
num_features = [x for x in train.select_dtypes(include = ['float64', 'int64']).columns if x not in ['id', 'loss', 'log_loss']]
ntrain = train.shape[0]
train_x = train[features]
train_y = train['log_loss']
for c in range(len(cat_features)):
    train_x.loc[:, cat_features[c]] = train_x.loc[:, cat_features[c]].astype('category').cat.codes
print('Xtrain:', train_x.shape)
print('ytrain:', train_y.shape)
Xtrain: (188318, 130)
ytrain: (188318,)
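One caveat: cat.codes is derived from the training data alone, so the same string could map to a different integer in the test set. A minimal sketch of one way to align the encodings (illustrative, not from the original notebook; it assumes test has the same categorical columns):
# Build each feature's category list from train and test together,
# so identical strings always receive identical codes.
for c in cat_features:
    categories = pd.concat([train[c], test[c]]).astype('category').cat.categories
    train[c] = pd.Categorical(train[c], categories = categories).codes
    test[c] = pd.Categorical(test[c], categories = categories).codes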
2. A baseline XGBoost model
We first train a basic xgboost model, then tune its parameters, using cross-validation to observe how the results change. Performance is measured with the mean absolute error on the original scale:
mean_absolute_error(np.exp(y), np.exp(yhat))
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
def xgb_eval_mae(yhat, dtrain):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y), np.exp(yhat))
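Because the model is trained on log(loss), the metric exponentiates both arguments to report the error in the original units. A toy check with made-up values (illustrative only):
y_log = np.log(np.array([100.0, 2000.0]))
yhat_log = np.log(np.array([110.0, 1900.0]))
# (|100 - 110| + |2000 - 1900|) / 2 = 55.0
print(mean_absolute_error(np.exp(y_log), np.exp(yhat_log)))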
dtrain = xgb.DMatrix(train_x, train['log_loss'])
# Note: 'reg:linear' is the old name of 'reg:squarederror' in newer xgboost releases.
xgb_params = {'eta': 0.1, 'colsample_bytree': 0.5, 'subsample': 0.5, 'max_depth': 5,
              'silent': 0, 'seed': 42, 'objective': 'reg:linear', 'min_child_weight': 3}
# Cross-validation with xgb.cv
bst_cv = xgb.cv(xgb_params, dtrain, num_boost_round = 50, nfold = 3, seed = 42,
feval = xgb_eval_mae, maximize = False, early_stopping_rounds = 10)
print('CV score:', bst_cv.iloc[-1, :]['test-mae-mean'])
CV score: 1178.3618979999999
bst_cv[['train-mae-mean', 'test-mae-mean']].plot(figsize = (8, 6))
%%time
# Build a model with 100 trees
bst_cv2 = xgb.cv(xgb_params, dtrain, num_boost_round = 100, nfold = 3, seed = 42,
feval = xgb_eval_mae, maximize = False, early_stopping_rounds = 10)
print('CV score:', bst_cv2.iloc[-1, :]['test-mae-mean'])
CV score: 1175.4696043333333
Wall time: 32.2 s
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 5))
ax1.plot(bst_cv2[['train-mae-mean', 'test-mae-mean']])
ax1.set(title = '100 rounds of training', xlabel = 'Rounds', ylabel = 'Loss')
ax1.legend(['Training Loss', 'Test Loss'])
ax2.plot(bst_cv2.iloc[20:][['train-mae-mean', 'test-mae-mean']])
ax2.set(title = '80 last rounds of training', xlabel = 'Rounds', ylabel = 'Loss')
ax2.legend(['Training Loss', 'Test Loss'])
3. Parameter tuning
- Step 1: choose an initial set of parameters.
- Step 2: tune max_depth and min_child_weight.
- Step 3: tune gamma to reduce the risk of overfitting.
- Step 4: tune subsample and colsample_bytree to change the data sampling strategy.
- Step 5: tune the learning rate eta.
3.* An XGBoostRegressor wrapper class
class XGBoostRegressor(object):
    def __init__(self, **kwargs):
        self.params = kwargs
        # Default to 50 boosting rounds if none were given; num_boost_round stays
        # inside params so GridSearchCV's clone() preserves it via get_params()
        # (xgboost itself ignores the unknown key).
        self.num_boost_round = self.params.get('num_boost_round', 50)
        self.params.update({'silent': 0, 'objective': 'reg:squarederror', 'seed': 42})

    def fit(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, y_train)
        self.bst = xgb.train(params = self.params, dtrain = dtrain, feval = xgb_eval_mae,
                             num_boost_round = self.num_boost_round, maximize = False)

    def predict(self, x_pred):
        dpred = xgb.DMatrix(x_pred)
        return self.bst.predict(dpred)

    def kfold(self, x_train, y_train, nfold = 5):
        dtrain = xgb.DMatrix(x_train, y_train)
        cv_rounds = xgb.cv(params = self.params, dtrain = dtrain, num_boost_round = self.num_boost_round,
                           nfold = nfold, feval = xgb_eval_mae, maximize = False, early_stopping_rounds = 10)
        # Return the metrics of the final (best) round.
        return cv_rounds.iloc[-1, :]

    def plot_feature_importances(self):
        feat_imp = pd.Series(self.bst.get_fscore()).sort_values(ascending = False)
        feat_imp.plot(title = 'Feature Importances')
        plt.ylabel('Feature Importance Score')

    # get_params / set_params make the wrapper usable inside GridSearchCV.
    def get_params(self, deep = True):
        return self.params

    def set_params(self, **params):
        self.params.update(params)
        self.num_boost_round = self.params.get('num_boost_round', self.num_boost_round)
        return self
3.1 Initial parameters
from sklearn.metrics import mean_absolute_error, make_scorer
def mae_score(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))

# greater_is_better = False tells scikit-learn to negate the score,
# which is why the best_score_ values reported below are negative.
mae_scorer = make_scorer(mae_score, greater_is_better = False)
bst = XGBoostRegressor(eta = 0.1, colsample_bytree = 0.5, subsample = 0.5,
max_depth = 5, min_child_weight = 3, num_boost_round = 50)
bst.kfold(train_x, train_y, nfold = 5)
3.2 Tuning max_depth and min_child_weight
from sklearn.model_selection import GridSearchCV
xgb_param_grid = {'max_depth': list(range(4, 10)), 'min_child_weight': [1, 2, 3, 6]}
grid = GridSearchCV(XGBoostRegressor(eta = 0.1, num_boost_round = 50, colsample_bytree = 0.5, subsample = 0.5),
param_grid = xgb_param_grid, scoring = mae_scorer, cv = 5)
grid.fit(train_x, train_y.values)
print(grid.best_score_)
grid.best_params_
-1183.6253147435195
{'max_depth': 9, 'min_child_weight': 6}
scores = grid.cv_results_['mean_test_score'].reshape(6, 4)
plt.style.use('seaborn-darkgrid')
plt.figure(figsize = (10, 6))
cp = plt.contourf(xgb_param_grid['min_child_weight'], xgb_param_grid['max_depth'], scores, cmap = 'BrBG')
plt.colorbar(cp)
plt.annotate('We use this', xy = (5.95, 8.95), xytext = (4, 8.5), arrowprops = {'facecolor': 'white'})
plt.annotate('Good for depth = 8', xy = (5.95, 8.05), xytext = (4, 7.5), arrowprops = {'facecolor': 'white'})
plt.title('Depth / min_child_weight optimization')
plt.xlabel('min_child_weight')
plt.ylabel('max_depth')
3.3 Tuning gamma to reduce the risk of overfitting
%%time
xgb_param_grid = {'gamma': [0.1 * i for i in range(0, 6)]}
grid = GridSearchCV(XGBoostRegressor(eta = 0.1, num_boost_round = 50, max_depth = 9,
min_child_weight = 6, colsample_bytree = 0.5, subsample = 0.5),
param_grid = xgb_param_grid, cv = 5, scoring = mae_scorer)
grid.fit(train_x, train_y)
print(grid.best_score_)
print(grid.best_params_)
# grid.cv_results_
-1182.9067965064644
{'gamma': 0.4}
3.4 Tuning the sampling parameters subsample and colsample_bytree
%%time
xgb_param_grid = {'subsample': [0.1 * i for i in range(6, 9)], 'colsample_bytree': [0.1 * i for i in range(6, 9)]}
grid = GridSearchCV(XGBoostRegressor(eta = 0.1, gamma = 0.3, num_boost_round = 50, max_depth = 9, min_child_weight = 6),
param_grid = xgb_param_grid, cv = 5, scoring = mae_scorer)
grid.fit(train_x, train_y.values)
print(grid.best_score_)
print(grid.best_params_)
-1179.5227148125255
{'colsample_bytree': 0.7000000000000001, 'subsample': 0.8}
scores = grid.cv_results_['mean_test_score'].reshape(3, 3)
plt.figure(figsize = (10, 6))
cp = plt.contourf(xgb_param_grid['subsample'], xgb_param_grid['colsample_bytree'], scores, cmap = 'BrBG')
plt.colorbar(cp)
plt.title('Subsampling params tuning')
plt.xlabel('Subsample')
plt.ylabel('Colsample_bytree')
plt.grid();
3.5 Decreasing the learning rate while increasing the number of trees
3.5.1 Decreasing the learning rate
%%time
xgb_param_grid = {'eta': [0.01, 0.025, 0.05, 0.075, 0.1, 0.2, 0.3, 0.4, 0.5]}
grid = GridSearchCV(XGBoostRegressor(num_boost_round = 50, gamma = 0.3, max_depth = 9,
min_child_weight = 6, colsample_bytree = 0.7, subsample = 0.8),
param_grid = xgb_param_grid, cv = 5, scoring = mae_scorer)
grid.fit(train_x, train_y)
print(grid.best_score_)
grid.best_params_
-1163.0047500123446
{'eta': 0.2}
eta = xgb_param_grid['eta']
scores = grid.cv_results_['mean_test_score']
plt.figure(figsize = (10, 5))
plt.plot(eta, -scores)
plt.title('MAE and ETA, 50 trees')
plt.xlabel('Eta')
plt.ylabel('Score')
3.5.2 Increasing the number of trees to 100
%%time
xgb_param_grid = {'eta': [0.01, 0.025, 0.05, 0.075, 0.1, 0.2, 0.3, 0.4, 0.5]}
grid = GridSearchCV(XGBoostRegressor(num_boost_round = 100, gamma = 0.3, max_depth = 9,
min_child_weight = 6, colsample_bytree = 0.7, subsample = 0.8),
param_grid = xgb_param_grid, cv = 5, scoring = mae_scorer)
grid.fit(train_x, train_y)
print(grid.best_score_)
print(grid.best_params_)
eta = xgb_param_grid['eta']
scores = grid.cv_results_['mean_test_score']
plt.figure(figsize = (10, 5))
plt.plot(eta, -scores)
plt.title('MAE and ETA, 100 trees')
plt.xlabel('Eta')
plt.ylabel('Score')
-1152.0955028181038
{'eta': 0.1}
3.5.3 Increasing the number of trees to 200
%%time
xgb_param_grid = {'eta': [0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]}
grid = GridSearchCV(XGBoostRegressor(num_boost_round = 200, gamma = 0.3, max_depth = 9,
min_child_weight = 6, colsample_bytree = 0.7, subsample = 0.8),
param_grid = xgb_param_grid, cv = 5, scoring = mae_scorer)
grid.fit(train_x, train_y)
print(grid.best_score_)
print(grid.best_params_)
eta = xgb_param_grid['eta']
scores = grid.cv_results_['mean_test_score']
plt.figure(figsize = (10, 5))
plt.plot(eta, -scores)
plt.title('MAE and ETA, 200 trees')
plt.xlabel('Eta')
plt.ylabel('Score')
-1146.114608201662
{'eta': 0.06}
3.6 The final XGBoost model
bst = XGBoostRegressor(num_boost_round = 200, eta = 0.06, gamma = 0.3, max_depth = 9,
min_child_weight = 6, colsample_bytree = 0.7, subsample = 0.8)
cv = bst.kfold(train_x, train_y, nfold = 5)
cv
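With the parameters fixed, the final model can be fit on the full training set and used to score the test data. A hedged sketch (it assumes the categorical columns of test have been integer-encoded the same way as train_x, e.g. via the alignment loop in section 1.4):
# Fit on all training data; predictions come back on the log scale.
bst.fit(train_x, train_y)
test_x = test[features]
pred = np.exp(bst.predict(test_x))
submission = pd.DataFrame({'id': test['id'], 'loss': pred})
submission.to_csv('submission.csv', index = False)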