2019騰訊廣告算法大賽之使用XGBOOST模型+網格搜索輕鬆上80

前三個部分分別介紹了如何清洗廣告數據集、用戶數據集、曝光廣告數據集和測試集，以及如何構造訓練的標籤，具體鏈接見下文。在構造好訓練集之後，我們開始使用XGBOOST模型訓練數據集。訓練方法分爲兩個版本：第一個版本是簡單版本，訓練集的屬性列中只包含取唯一值的數據；第二個版本是加上取多值的屬性列。參考的代碼是bryan大佬18年騰訊算法大賽公佈的baseline。

第一部分：如何清洗廣告數據集和用戶數據集

第二部分：如何清洗曝光廣告數據集以及構造標籤

第三部分：如何整理測試數據集以及構造訓練集

前半年一直做的是GAN生成圖像相關的工作，所以對XGBOOST不是很瞭解，只是簡單參考別人的模型依樣畫葫蘆。版本一的代碼能跑通，但是最後提交的效果很差；版本二應該是訓練集中的數據出現了非法字符（例如 (' ,) 一類），所以跑不通，準備先寫論文就不做了。

版本1：屬性列只取唯一值，再加上單調性處理的話，在測試集A上可以達到79.75分（已經試驗過）。

# -*- coding: utf-8 -*-
# @Time    : 2019/5/4 9:11
# @Author  : YYLin
# @Email   : [email protected]
# @File    : Code_For_Tencent.py
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np
import sys
from xgboost import plot_importance
from sklearn.preprocessing import Imputer

def loadDataset(filePath):
    """Read the CSV file at ``filePath`` and return it as a pandas DataFrame."""
    return pd.read_csv(filePath)

def featureSet(data):
    """Split the training frame into a feature matrix and a label vector.

    Args:
        data: DataFrame holding the columns ``ad_bid``, ``Ad_material_size``,
            ``Ad_Industry_Id``, ``Commodity_type`` and ``num_click``.

    Returns:
        A ``(XList, yList)`` tuple. ``XList`` is a list with one
        ``[ad_bid, Ad_material_size, Ad_Industry_Id, Commodity_type]`` entry
        per row of ``data``; ``yList`` is the ``num_click`` column as a NumPy
        array.
    """
    # 'Delivery_time' (the delivery schedule) is intentionally not used yet.
    feature_columns = ['ad_bid', 'Ad_material_size', 'Ad_Industry_Id', 'Commodity_type']

    # Select all feature columns in one vectorised step. The previous
    # ``data.iloc[row][col]`` loop materialised a temporary Series for every
    # single cell, which is very slow on large frames.
    XList = data[feature_columns].values.tolist()
    yList = data.num_click.values
    return XList, yList

def loadTestData(filePath):
    """Load the test CSV and return its feature rows.

    Args:
        filePath: Path of a CSV file containing the columns ``ad_bid``,
            ``Ad_material_size``, ``Ad_Industry_Id`` and ``Commodity_type``.

    Returns:
        A list with one
        ``[ad_bid, Ad_material_size, Ad_Industry_Id, Commodity_type]`` entry
        per row of the file.
    """
    data = pd.read_csv(filepath_or_buffer=filePath)

    # 'Delivery_time' (the delivery schedule) is intentionally not used yet.
    feature_columns = ['ad_bid', 'Ad_material_size', 'Ad_Industry_Id', 'Commodity_type']

    # One vectorised column selection replaces the per-row ``iloc`` lookups,
    # which created a temporary Series for every cell.
    return data[feature_columns].values.tolist()

def trainandTest(X_train, y_train, X_test):
    """Fit an XGBoost regressor and write its test predictions to ``submission.csv``.

    Args:
        X_train: Training feature rows, passed to ``XGBRegressor.fit``.
        y_train: Training targets, passed to ``XGBRegressor.fit``.
        X_test: Feature rows to predict on.

    The output file has two columns: a 1-based running sample id and the
    prediction rounded to four decimal places. It is written without an index
    column and, as before, with the default ``0,1`` header row.
    """
    # XGBoost training.
    model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=False, objective='reg:gamma')
    model.fit(X_train, y_train)

    # Predict on the test set.
    ans = model.predict(X_test)

    # Round in float64 and keep ids and predictions as separate, typed columns.
    # The former row-by-row list went through ``np.array``, which coerced the
    # integer ids to floats ("1.0") and widened the already-rounded prediction
    # scalars (presumably float32 -- confirm for the xgboost version in use),
    # so the written values were no longer limited to four decimals.
    rounded = np.round(np.asarray(ans, dtype=np.float64), 4)

    # The ids are generated from the prediction count, so their lengths always
    # match; the old length check and its ``sys.exit()`` branch were unreachable.
    sample_ids = np.arange(1, len(rounded) + 1)

    # Save the result.
    pd_data = pd.DataFrame({0: sample_ids, 1: rounded})
    pd_data.to_csv('submission.csv', index=None)

    # Uncomment to display the feature importances:
    # plot_importance(model)
    # plt.show()


# Script entry point: load both CSV files, build the features, then train and predict.
if __name__ == '__main__':
    trainFilePath = '../Dataset/Result/Result_For_Train_test.csv'
    testFilePath = '../Dataset/Result/Result_For_Test.csv'
    print("！！！！！！！！！正在加載數據集！！！！！！！！！")
    data = loadDataset(trainFilePath)
    # DataFrame.info() prints its report itself and returns None, so nesting it
    # inside print() only appended a stray "None" to the label.
    print("訓練集中的數據信息是:")
    data.info()
    X_test = loadTestData(testFilePath)
    print("！！！！！！！！！正在構建模型的特徵！！！！！！！！！！！")
    X_train, y_train = featureSet(data)
    print("！！！！！！正在訓練中！！！！！！！！！！")
    trainandTest(X_train, y_train, X_test)

版本2：2019-07-07新增，XGB能夠使用網格搜索自動尋找最優參數。

# -*- coding: utf-8 -*-
# @Time    : 2019/5/21 23:35
# @Author  : YYLin
# @Email   : [email protected]
# @File    : new_submission_used_xgboost_v4.py
# 在模型中加入新的特徵 廣告 例如每日廣告ID的出現次數
# 現在是講LGB轉化成XGBoost階段 暫時不在數據集中增加數據
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, StandardScaler
import xgboost as xgb
import numpy as np
import sys
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from xgboost import plot_importance
import math
# v4 加上一些特徵選擇的方法
from sklearn.feature_selection import SelectPercentile

# 05.21: widen the pandas display limits so frames are printed in full.
pd.options.display.max_columns = 1000
pd.options.display.width = 1000
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = None


# Set this to True first to run the grid search; once good parameters are
# found, set it back to False and copy them into the training configuration.
find_params = False
# Turn a raw frame into plain-list feature rows (plus the log target for training data).
def encode_feature_data(data, for_train=True):
    """Extract the model features, and for training data the log-transformed target.

    The feature values are passed through unchanged. An earlier revision fitted
    a ``OneHotEncoder`` on the whole frame but discarded the transformed
    result, so that slow no-op has been removed.
    TODO: if one-hot features are wanted, the encoder output must actually be
    used as the feature matrix.

    Args:
        data: DataFrame containing ``ad_request_datetime``, ``ad_account_id``,
            ``commodity_id``, ``commodity_type``, ``ad_industry_id`` and
            ``ad_martril_size``; training frames also need ``Exporse``.
            The frame is not modified.
        for_train: When True (default) the target is returned as well.

    Returns:
        ``(XList, Ylist)`` when ``for_train`` is True, otherwise only
        ``XList``. ``XList`` holds one six-element list per row and ``Ylist``
        is a NumPy array with the natural log of ``Exporse``.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If an ``Exporse`` value is not positive (``math.log``).
    """
    # DataFrame.info() prints its report itself and returns None, so it is
    # called on its own instead of being nested inside print().
    if for_train:
        print("*****正在加載訓練數據, 訓練數據的屬性特點是*******")
    else:
        print("*****正在加載測試數據，測試數據的屬性特點是*******")
    data.info()

    feature_columns = [
        'ad_request_datetime',
        'ad_account_id',
        'commodity_id',
        'commodity_type',
        'ad_industry_id',
        'ad_martril_size',
    ]
    # One vectorised selection replaces the two duplicated per-row loops. The
    # old ``data[col][row]`` lookups were label-based, so they also failed on
    # frames whose index is not 0..n-1.
    XList = data[feature_columns].values.tolist()

    if not for_train:
        return XList

    # Compute the log target on a derived Series; the original overwrote the
    # caller's 'Exporse' column, so a second call would have logged it twice.
    Ylist = data['Exporse'].apply(math.log).values
    return XList, Ylist


# Train the XGBoost model; the grid-search branch is selected through find_params.
def XGB_predict(X_train, y_train, X_test):
    """Run a grid search, or train the final model and write the submission.

    Behaviour depends on the module-level ``find_params`` flag:

    * True: run ``GridSearchCV`` over ``n_estimators`` and print the best score
      plus the mean test score of every candidate. Nothing is predicted.
    * False: fit an ``XGBRegressor`` with fixed parameters, predict
      ``X_test``, apply ``np.exp`` to the predictions (the training target
      built by ``encode_feature_data`` is a natural log), round to four
      decimals and write ``id,prediction`` rows without header or index to
      ``../Dataset/dataset_for_train/submission.csv``. Afterwards the feature
      importance plot is shown.

    Args:
        X_train: Training feature rows.
        y_train: Training targets.
        X_test: Feature rows to predict on (unused while ``find_params`` is True).
    """
    if find_params:
        # cv_params lists the candidate values to search. For tuning the other
        # parameters see https://blog.csdn.net/sinat_35512245/article/details/79700029
        cv_params = {'n_estimators': [4000, 5000, 6000, 7000, 10000]}

        # A wider grid that can be searched instead:
        # cv_params = {'n_estimators': [400, 500, 600, 700, 800],  'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        # 'min_child_weight': [1, 2, 3, 4, 5, 6], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], 'subsample': [0.6, 0.7, 0.8, 0.9],
        # 'colsample_bytree': [0.6, 0.7, 0.8, 0.9], 'reg_alpha': [0.05, 0.1, 1, 2, 3], 'reg_lambda': [0.05, 0.1, 1, 2, 3],
        #              'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2]}

        other_params = {'learning_rate': 0.1, 'n_estimators': 5000, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                        'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

        # Wrap an XGBRegressor in GridSearchCV to pick the best parameters.
        model = xgb.XGBRegressor(**other_params)
        optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
        grid_search = optimized_GBM.fit(X_train, y_train)

        print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))

        means = grid_search.cv_results_['mean_test_score']
        params = grid_search.cv_results_['params']
        for mean, param in zip(means, params):
            print("%f  with:   %r" % (mean, param))
    else:
        # Train the final model with the chosen parameters.
        model = xgb.XGBRegressor(learning_rate=0.1, n_estimators=6000, max_depth=5, min_child_weight=1, seed=0,
                        subsample=0.8, colsample_bytree=0.8, gamma=0, reg_alpha=0, reg_lambda=1)
        model.fit(X_train, y_train)

        # Predict on the test set and undo the log transform.
        predict_result = np.exp(model.predict(X_test))

        # Round in float64 so the written values stay at four decimals; the old
        # code rounded each prediction scalar on its own (presumably float32 --
        # confirm for the xgboost version in use) inside a Python loop.
        rounded = np.round(np.asarray(predict_result, dtype=np.float64), 4)

        # The ids are generated from the prediction count, so their lengths
        # always match; the old length check and its ``sys.exit()`` branch were
        # unreachable.
        sample_ids = np.arange(1, len(rounded) + 1)

        # Save the final result. Monotonicity is not enforced here and has to
        # be applied to the output in a separate step.
        pd_data = pd.DataFrame({0: sample_ids, 1: rounded})
        pd_data.to_csv('../Dataset/dataset_for_train/submission.csv', index=None, header=None)

        # NOTE: the plot window has to be closed by hand; comment these two
        # lines out when the script should run unattended.
        plot_importance(model)
        plt.show()


# Script entry point: build train/test features from the CSV files, then run XGB_predict.
if __name__ == '__main__':
    train_csv = '../Dataset/dataset_for_train/train_op_dp.csv'
    test_csv = '../Dataset/dataset_for_train/update_Btest_sample.csv'

    print("正在加載數據！！！！構建數據特徵")

    # Building the features is slow; for a quick trial read only the first
    # rows, e.g. pd.read_csv(train_csv, nrows=10).
    X_train, y_train = encode_feature_data(pd.read_csv(train_csv))

    # Debug output: container types plus the second feature row as a sample.
    sample_row = X_train[1]
    print("X_train的類型是:", type(X_train), type(sample_row), sample_row, type(y_train))

    X_test = encode_feature_data(pd.read_csv(test_csv), for_train=False)

    print("*******正在訓練中*******")
    XGB_predict(X_train, y_train, X_test)

2019騰訊廣告算法大賽之使用XGBOOST模型+網格搜索輕鬆上80

這種嵌套字典類型的數據，我想把它讀取到df裏，如何操作？

微調真的能讓LLM學到新東西嗎:引入新知識可能讓模型產生更多的幻覺

iNeuOS工業互聯網操作系統，增加電力IEC104協議

微服務實踐k8s&dapr開發部署實驗（3）訂閱發佈

圖像識別模型優化技巧之dropout 學習率遞減 L1、L2正則 bacth normal的對比

2019移動廣告反欺詐算法挑戰賽之初始數據分析

2019移動廣告反欺詐算法挑戰賽baseline

2019騰訊廣告算法大賽之使用XGBOOST模型+網格搜索輕鬆上80

ImportError: No module named 'imgaug'

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

2019騰訊廣告算法大賽之使用XGBOOST模型+網格搜索 輕鬆上80

2019騰訊廣告算法大賽之使用XGBOOST模型+網格搜索輕鬆上80