前三個部分分別介紹瞭如何清洗廣告數據集、用戶數據集、曝光廣告數據集和測試集,以及如何構造訓練的標籤,具體鏈接見下文。在我們構造好訓練集之後,我們開始使用XGBOOST模型訓練數據集,訓練方法分爲兩個版本:第一個版本是簡單版本,訓練集的屬性列中只包含取唯一值的數據;第二個版本是加上取多值的屬性列。參考的代碼鏈接是bryan大佬18年騰訊算法大賽公佈的baseline。
第一部分: 如何清洗廣告數據集和用戶數據集
第二部分: 如何清洗曝光廣告數據集以及構造標籤
第三部分: 如何整理測試數據集以及構造訓練集
前半年一直做的是GAN生成圖像相關,所以對這個XGBOOST不是很瞭解,只是簡單參考別人的模型比着葫蘆畫個瓢。版本一代碼能跑通,但是最後提交代碼效果很差;版本二應該是訓練集中的數據出現了非法字符例如 (' ,) 一類的跑不通,準備先寫論文就不做了。
版本1:屬性列只是取唯一值
# -*- coding: utf-8 -*-
# @Time : 2019/5/4 9:11
# @Author : YYLin
# @Email : [email protected]
# @File : Code_For_Tencent.py
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np
import sys
from xgboost import plot_importance
from sklearn.preprocessing import Imputer
def loadDataset(filePath):
    """Read the CSV file at *filePath* into a pandas DataFrame."""
    return pd.read_csv(filePath)
def featureSet(data):
    """Extract the single-valued feature columns and click-count labels.

    :param data: training DataFrame; must contain the four feature columns
        below plus a ``num_click`` label column.
    :return: tuple ``(XList, yList)`` — XList is a list of per-row feature
        lists in the order [ad_bid, Ad_material_size, Ad_Industry_Id,
        Commodity_type]; yList is the numpy array of click counts.
    """
    feature_cols = ['ad_bid', 'Ad_material_size', 'Ad_Industry_Id', 'Commodity_type']
    # 'Delivery_time' (delivery period) is deliberately excluded for now,
    # matching the original's commented-out line.
    # Vectorized extraction replaces the original per-row .iloc loop: one
    # .iloc lookup per cell is extremely slow on large frames, while a single
    # column slice + tolist() does the same work in C.
    XList = data[feature_cols].values.tolist()
    yList = data.num_click.values
    return XList, yList
def loadTestData(filePath):
    """Load the test-set CSV and return its feature rows.

    :param filePath: path to the test-set CSV file.
    :return: list of per-row feature lists in the order
        [ad_bid, Ad_material_size, Ad_Industry_Id, Commodity_type] —
        the same column order featureSet() uses for training rows.
    """
    data = pd.read_csv(filepath_or_buffer=filePath)
    feature_cols = ['ad_bid', 'Ad_material_size', 'Ad_Industry_Id', 'Commodity_type']
    # 'Delivery_time' is deliberately excluded, mirroring the training side.
    # Vectorized extraction replaces the slow per-row .iloc loop.
    return data[feature_cols].values.tolist()
def trainandTest(X_train, y_train, X_test):
    """Fit an XGBoost regressor and write test predictions to submission.csv.

    :param X_train: training feature rows.
    :param y_train: training labels (click counts).
    :param X_test: test feature rows.

    Side effect: writes 'submission.csv' (id, rounded prediction) in the
    current working directory.
    """
    # XGBoost training; 'reg:gamma' objective suits positive-valued targets.
    model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160,
                             silent=False, objective='reg:gamma')
    model.fit(X_train, y_train)
    # Predict the test set, keeping four decimal places.
    ans = model.predict(X_test)
    # Ids are derived directly from the prediction length, so the original
    # length-mismatch check (which could never fire) is dropped.
    id_list = np.arange(1, len(ans) + 1)
    # BUG FIX: the original built np.array([[int, float], ...]), which upcast
    # the id column to float in the CSV. Building the frame column-wise keeps
    # ids as integers.
    pd_data = pd.DataFrame({0: id_list, 1: np.round(ans, 4)})
    pd_data.to_csv('submission.csv', index=None)
    # Optional: show feature importance
    # plot_importance(model)
    # plt.show()
if __name__ == '__main__':
    # Paths to the cleaned training and test CSVs produced in parts 1–3 of
    # the series.
    trainFilePath = '../Dataset/Result/Result_For_Train_test.csv'
    testFilePath = '../Dataset/Result/Result_For_Test.csv'
    print("!!!!!!!!!正在加載數據集!!!!!!!!!")
    data = loadDataset(trainFilePath)
    # NOTE(review): data.info() prints to stdout and returns None, so this
    # print shows "None" after the info dump.
    print("訓練集中的數據信息是:\n", data.info())
    X_test = loadTestData(testFilePath)
    print("!!!!!!!!!正在構建模型的特徵!!!!!!!!!!!")
    X_train, y_train = featureSet(data)
    print("!!!!!!正在訓練中!!!!!!!!!!")
    # Trains the model and writes submission.csv as a side effect.
    trainandTest(X_train, y_train, X_test)
版本2:訓練中增加屬性列取多值的情況
# -*- coding: utf-8 -*-
# @Time : 2019/5/4 10:29
# @Author : YYLin
# @Email : [email protected]
# @File : Code_For_Tencent_Improve_V2.py
# 該模型的提升方案是首先使用均值對數據集進行補齊操作 然後對於特殊的字段使用獨特的編碼方式
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import preprocessing
import numpy as np
from xgboost import plot_importance
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import sys
def featureSet(data):
    """Build feature rows (single- and multi-valued columns) plus labels.

    :param data: training DataFrame containing the columns listed below plus
        the ``num_click`` label column.
    :return: tuple ``(XList, yList)``.
    """
    # Mean-imputation of the missing multi-valued columns was tried and
    # shelved by the original author (it was commented out); left out here.
    #
    # Single-valued columns: integer label-encode each one.
    single_cols = ['ad_bid', 'Ad_Industry_Id', 'Commodity_type', 'Ad_material_size']
    encoded = {col: LabelEncoder().fit_transform(data[col].apply(int))
               for col in single_cols}
    # Multi-valued columns: bag-of-words encode each with its own
    # CountVectorizer. (The original reused a single instance, refitting it
    # per column — equivalent, but easy to get wrong.)
    multi_cols = ['Delivery_time', 'area', 'status', 'behavior', 'age', 'gender',
                  'education', 'device', 'consuptionAbility', 'connectionType']
    vectorized = {}
    for col in multi_cols:
        # BUG FIX: the original wrote
        #     cv.fit_transform(data['area']).values.astype('U')
        # which calls .values on the scipy sparse matrix returned by
        # fit_transform and crashes. Cast the raw column to unicode BEFORE
        # vectorizing; this also tolerates NaN cells (they become 'nan').
        vectorized[col] = CountVectorizer().fit_transform(data[col].values.astype('U'))
    XList = []
    for row in range(len(data)):
        tmp_list = [encoded[col][row] for col in single_cols]
        # NOTE(review): each vectorized[col][row] is a 1xV scipy sparse row;
        # appending it into a plain Python feature list will not feed directly
        # into XGBRegressor — confirm how downstream consumes these rows.
        tmp_list.extend(vectorized[col][row] for col in multi_cols)
        XList.append(tmp_list)
    yList = data.num_click.values
    return XList, yList
def loadTestData(filePath):
    """Load the test CSV and build its feature rows (mirrors featureSet()).

    :param filePath: path to the test-set CSV file.
    :return: list of per-row feature lists (single-valued label codes followed
        by the multi-valued bag-of-words rows).

    NOTE(review): the encoders are fitted on the test data alone, so the label
    codes / vocabularies do NOT match those fitted on the training set in
    featureSet() — this likely hurts the model and should be unified by
    sharing fitted encoders between the two.
    """
    data = pd.read_csv(filepath_or_buffer=filePath)
    # Single-valued columns: integer label-encode each one.
    single_cols = ['ad_bid', 'Ad_Industry_Id', 'Commodity_type', 'Ad_material_size']
    encoded = {col: LabelEncoder().fit_transform(data[col].apply(int))
               for col in single_cols}
    # Multi-valued columns: bag-of-words encode each with its own vectorizer.
    multi_cols = ['Delivery_time', 'area', 'status', 'behavior', 'age', 'gender',
                  'education', 'device', 'consuptionAbility', 'connectionType']
    vectorized = {col: CountVectorizer().fit_transform(data[col].values.astype('U'))
                  for col in multi_cols}
    XList = []
    for row in range(len(data)):
        tmp_list = [encoded[col][row] for col in single_cols]
        tmp_list.extend(vectorized[col][row] for col in multi_cols)
        # BUG FIX: the original called XList.append(tmp_list) a second time
        # AFTER the loop, duplicating the last row and making len(X_test)
        # disagree with the id list built in trainandTest().
        XList.append(tmp_list)
    return XList
def trainandTest(X_train, y_train, X_test):
    """Fit an XGBoost regressor and write predictions to submission.csv.

    :param X_train: training feature rows.
    :param y_train: training labels (click counts).
    :param X_test: test feature rows.

    Side effect: writes 'submission.csv' (no header row) in the current
    working directory.
    """
    # XGBoost training; 'reg:gamma' objective suits positive-valued targets.
    model = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=500,
                             silent=False, objective='reg:gamma')
    model.fit(X_train, y_train)
    # Predict the test set.
    ans = model.predict(X_test)
    # Ids are derived directly from the prediction length, so the original
    # length-mismatch check (which could never fire) is dropped.
    id_list = np.arange(1, len(ans) + 1)
    # BUG FIX: the original built np.array([[int, float], ...]), which upcast
    # the id column to float in the output file. Building the frame
    # column-wise keeps ids as integers.
    pd_data = pd.DataFrame({0: id_list, 1: ans})
    pd_data.to_csv('submission.csv', index=None, header=None)
    # Optional: show feature importance
    # plot_importance(model)
    # plt.show()
if __name__ == '__main__':
    # Paths to (apparently 100-row) sample slices of the train/test sets —
    # presumably for quick iteration; verify before a full run.
    trainFilePath = '../Dataset/Result/train_100.csv'
    testFilePath = '../Dataset/Result/test_100.csv'
    print("!!!!!!!!!正在加載數據集!!!!!!!!!")
    data = pd.read_csv(trainFilePath)
    # NOTE(review): data.info() prints to stdout and returns None, so this
    # print shows "None" after the info dump.
    print("訓練集中的數據信息是:\n", data.info())
    print("!!!!!!!!!正在構建模型的特徵!!!!!!!!!!!")
    X_train, y_train = featureSet(data)
    print("!!!!!!正在訓練中!!!!!!!!!!")
    X_test = loadTestData(testFilePath)
    # Trains the model and writes submission.csv as a side effect.
    trainandTest(X_train, y_train, X_test)