天池二手車交易價格預測賽題理解之特徵分析模型和調參技巧
原文鏈接:
Datawhale 零基礎入門數據挖掘-Task4 建模調參
本文爲個人閱讀筆記,僅記錄閱讀過程中遇到的新知識。
- 模型
模型的簡單建立
#1.加載模型
from sklearn.linear_model import LinearRegression
#from sklearn.linear_model import Ridge
#from sklearn.linear_model import Lasso
#from sklearn.svm import SVC
#from sklearn.tree import DecisionTreeRegressor
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.neural_network import MLPRegressor
#from xgboost.sklearn import XGBRegressor
#from lightgbm.sklearn import LGBMRegressor
#模型實例化
model = LinearRegression(normalize=True)#其他模型類似
#向模型中填充數據
model = model.fit(train_X, train_y)
#模型預測
model.predict(train_X.loc[subsample_index])
- 多個模型對比
models = [LinearRegression(),
DecisionTreeRegressor(),
RandomForestRegressor(),
GradientBoostingRegressor(),
MLPRegressor(solver='lbfgs', max_iter=100),
XGBRegressor(n_estimators = 100, objective='reg:squarederror'),
LGBMRegressor(n_estimators = 100)]
#將各模型結果保存在字典中
result = dict()
for model in models:
model_name = str(model).split('(')[0]
scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error))
result[model_name] = scores
print(model_name + ' is finished')
result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result
- 線性模型的兩種正則化變種
在過濾式和包裹式特徵選擇方法中,特徵選擇過程與學習器訓練過程有明顯的分別。而嵌入式特徵選擇在學習器訓練過程中自動地進行特徵選擇。嵌入式選擇最常用的是L1正則化與L2正則化。在對線性迴歸模型加入兩種正則化方法後,他們分別變成了嶺迴歸與Lasso迴歸。
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
- K折交叉驗證
不把所有的數據集都拿來訓練,而是分出一部分來(這一部分不參加訓練)對訓練集生成的參數進行測試,相對客觀的判斷這些參數對訓練集之外的數據的符合程度。這種思想就稱爲交叉驗證(Cross Validation)。
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
5折交叉驗證
對未處理過標籤的數據進行5折驗證
對處理過標籤的數據進行5折驗證
問題:爲什麼未處理過標籤的數據要定義一個log_transfer函數?
需要注意的是:K折交叉驗證針對的是相互獨立的數據(我的猜測),如果是跟時間相關聯的,最好是取前n-k個數據訓練,最後k個數據驗證。
- 繪製學習曲線
繪製學習率曲線與驗證曲線
from sklearn.model_selection import learning_curve, validation_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,n_jobs=1, train_size=np.linspace(.1, 1.0, 5 )):
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel('Training example')
plt.ylabel('score')
train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_size, scoring = make_scorer(mean_absolute_error))
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()#區域
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1,
color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color='r',
label="Training score")
plt.plot(train_sizes, test_scores_mean,'o-',color="g",
label="Cross-validation score")
plt.legend(loc="best")
return plt
plot_learning_curve(LinearRegression(), 'Liner_model', train_X[:1000], train_y_ln[:1000], ylim=(0.0, 0.5), cv=5, n_jobs=1)
使用seaborn和matplotlib畫圖,參考資料
教你使用Matplotlib和Seaborn演示Python可視化
6. 模型調參
三種策略:貪心算法,網格調參和貝葉斯調參。
1)貪心調參
best_obj = dict()
for obj in objective:
model = LGBMRegressor(objective=obj)
score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)))
best_obj[obj] = score
best_leaves = dict()
for leaves in num_leaves:
model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x:x[1])[0], num_leaves=leaves)
score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)))
best_leaves[leaves] = score
best_depth = dict()
for depth in max_depth:
model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x:x[1])[0],
num_leaves=min(best_leaves.items(), key=lambda x:x[1])[0],
max_depth=depth)
score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)))
best_depth[depth] = score
通過min(best_obj.values()), min(best_leaves.values()), min(best_depth.values())獲取最優參數。
2)網格調參 (Grid Search 調參)
from sklearn.model_selection import GridSearchCV
parameters = {'objective': objective , 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
clf = GridSearchCV(model, parameters, cv=5)
clf = clf.fit(train_X, train_y)
通過clf.best_params_獲取最優參數值。
3)貝葉斯調參
from bayes_opt import BayesianOptimization
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
val = cross_val_score(
LGBMRegressor(objective = 'regression_l1',
num_leaves=int(num_leaves),
max_depth=int(max_depth),
subsample = subsample,
min_child_samples = int(min_child_samples)
),
X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)
).mean()
return 1 - val
rf_bo = BayesianOptimization(
rf_cv,
{
'num_leaves': (2, 100),
'max_depth': (2, 100),
'subsample': (0.1, 1),
'min_child_samples' : (2, 100)
}
)
通過rf_bo.maximize()獲取最優參數值。