博客瀏覽
1.https://blog.csdn.net/qq_41032884/article/details/106619158
2.https://blog.csdn.net/qq_41032884/article/details/106638006
3.https://blog.csdn.net/qq_41032884/article/details/106676616
4.https://blog.csdn.net/qq_41032884/article/details/106688866
5.https://blog.csdn.net/qq_41032884/article/details/106708659
6.https://blog.csdn.net/qq_41032884/article/details/106837553
7.https://blog.csdn.net/qq_41032884/article/details/106865137
8.https://blog.csdn.net/qq_41032884/article/details/106959074
9.https://blog.csdn.net/qq_41032884/article/details/106986390
10.https://blog.csdn.net/qq_41032884/article/details/107008654
11.https://blog.csdn.net/qq_41032884/article/details/107025727
工作要點
我的工作主要內容是數據處理和分析,算法建模。主要完成工作爲:
慕課學習記錄的處理和相關算法如gcforest、SVR、線性迴歸等多種算法建模,比較多項指標
特徵工程實現,嘗試多種不同的特徵選擇等方法提高訓練指標
學生其他校園記錄數據的預處理和數據分析
深度學習建模訓練,調參,分析結果,整理提交給同學
工作難點
主要的工作難點有幾個方面:
1.特徵工程 如何進行特徵選擇,優化模型,我採用了多種方法,並且比照多項指標
'''
卡方檢驗選擇特徵
'''
# --- Feature selection via the chi-square test, then benchmark 5 regressors ---
from __future__ import division
import time
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, learning_curve, cross_val_score
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
# NOTE: the private module path sklearn.ensemble.gradient_boosting was removed
# in scikit-learn 0.24; import from the public sklearn.ensemble package.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE, SelectFromModel

# Collect every file under the data/ directory.
file = []
for root, dirs, files in os.walk('data'):
    for name in files:
        file.append(os.path.join(root, name))

for f in file:
    if f == 'data/.DS_Store':  # skip macOS Finder metadata
        continue
    data = pd.read_csv(f, header=None, index_col=0)
    data_ = data.iloc[:, 3:]             # drop the first 3 columns (ids/meta -- TODO confirm)
    data_ = data_.sample(frac=1)         # shuffle the rows
    dataset = np.array(data_)
    index = int(dataset.shape[0] * 0.8)  # 80/20 train/test split
    data_x = dataset[:, :-1]
    data_y = dataset[:, -1]
    X_train = data_x[:index, :]
    y_train = data_y[:index]
    X_test = data_x[index:, :]
    y_test = data_y[index:]

    # Keep the 2 features with the highest chi-square statistic.
    # NOTE(review): chi2 requires non-negative feature values and a discrete
    # target -- confirm the dataset satisfies both.
    model = SelectKBest(chi2, k=2)
    X_train = model.fit_transform(X_train, y_train)
    X_test = model.transform(X_test)

    # The five regression models to compare.
    model_br = BayesianRidge()
    model_lr = LinearRegression()
    model_etc = ElasticNet()
    model_svr = SVR()
    model_gbr = GradientBoostingRegressor()
    model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR']
    model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr]

    cv_score_list = []  # 5-fold CV scores per model
    pre_y_list = []     # test-set predictions per model
    for model in model_dic:
        scores = cross_val_score(model, X_train, y_train, cv=5)
        cv_score_list.append(scores)
        pre_y_list.append(model.fit(X_train, y_train).predict(X_test))

    # Evaluate every model on four regression metrics.
    model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]
    model_metrics_list = []
    for i in range(len(model_dic)):
        tmp_list = [m(y_test, pre_y_list[i]) for m in model_metrics_name]
        model_metrics_list.append(tmp_list)

    df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2'])
    print('=' * 10, f, '=' * 10)
    print(df2)
'''
遞歸特徵消除法
'''
# --- Recursive feature elimination (RFE) per model, then benchmark regressors ---
from __future__ import division
import time
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, learning_curve, cross_val_score
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
# NOTE: sklearn.ensemble.gradient_boosting was removed in scikit-learn 0.24;
# use the public import path instead.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE, SelectFromModel

# Collect every file under the data/ directory.
file = []
for root, dirs, files in os.walk('data'):
    for name in files:
        file.append(os.path.join(root, name))

for f in file:
    if f == 'data/.DS_Store':  # skip macOS Finder metadata
        continue
    data = pd.read_csv(f, header=None, index_col=0)
    data_ = data.iloc[:, 3:]             # drop the first 3 columns (ids/meta -- TODO confirm)
    data_ = data_.sample(frac=1)         # shuffle the rows
    dataset = np.array(data_)
    index = int(dataset.shape[0] * 0.8)  # 80/20 train/test split
    data_x = dataset[:, :-1]
    data_y = dataset[:, -1]
    X_train = data_x[:index, :]
    y_train = data_y[:index]
    X_test = data_x[index:, :]
    y_test = data_y[index:]

    # The five regression models to compare.
    model_br = BayesianRidge()
    model_lr = LinearRegression()
    model_etc = ElasticNet()
    model_svr = SVR()
    model_gbr = GradientBoostingRegressor()
    model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR']
    model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr]

    cv_score_list = []  # 5-fold CV scores per model (on the FULL feature set)
    pre_y_list = []     # test-set predictions per model
    for model in model_dic:
        scores = cross_val_score(model, X_train, y_train, cv=5)
        cv_score_list.append(scores)
        # BUG FIX: run RFE against the ORIGINAL feature matrix for every model.
        # The original code reassigned X_train/X_test here, so each model after
        # the first was fitted on the features selected for the first model.
        # NOTE(review): RFE requires the estimator to expose coef_ or
        # feature_importances_; SVR with the default RBF kernel has neither
        # and will raise -- confirm SVR should use a linear kernel here.
        rfe = RFE(estimator=model, n_features_to_select=2)
        X_train_sel = rfe.fit_transform(X_train, y_train)
        X_test_sel = rfe.transform(X_test)
        pre_y_list.append(model.fit(X_train_sel, y_train).predict(X_test_sel))

    # Evaluate every model on four regression metrics.
    model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]
    model_metrics_list = []
    for i in range(len(model_dic)):
        tmp_list = [m(y_test, pre_y_list[i]) for m in model_metrics_name]
        model_metrics_list.append(tmp_list)

    df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2'])
    print('=' * 10, f, '=' * 10)
    print(df2)
'''
logistic懲罰項選擇特徵
'''
# --- Feature selection via an L2-penalized logistic model, then benchmark regressors ---
from __future__ import division
import time
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, learning_curve, cross_val_score
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet, LogisticRegression
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
# NOTE: sklearn.ensemble.gradient_boosting was removed in scikit-learn 0.24;
# use the public import path instead.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE, SelectFromModel

# Collect every file under the data/ directory.
file = []
for root, dirs, files in os.walk('data'):
    for name in files:
        file.append(os.path.join(root, name))

for f in file:
    if f == 'data/.DS_Store':  # skip macOS Finder metadata
        continue
    data = pd.read_csv(f, header=None, index_col=0)
    data_ = data.iloc[:, 3:]             # drop the first 3 columns (ids/meta -- TODO confirm)
    data_ = data_.sample(frac=1)         # shuffle the rows
    dataset = np.array(data_)
    index = int(dataset.shape[0] * 0.8)  # 80/20 train/test split
    data_x = dataset[:, :-1]
    data_y = dataset[:, -1]
    X_train = data_x[:index, :]
    y_train = data_y[:index]
    X_test = data_x[index:, :]
    y_test = data_y[index:]

    # Keep features whose |coefficient| in a penalized logistic model exceeds
    # the default threshold.
    # NOTE(review): LogisticRegression is a classifier, so y_train must hold
    # discrete class labels; a continuous regression target will raise here --
    # confirm how the target is encoded.
    model = SelectFromModel(LogisticRegression(penalty='l2', C=0.1))
    X_train = model.fit_transform(X_train, y_train)
    X_test = model.transform(X_test)

    # The five regression models to compare.
    model_br = BayesianRidge()
    model_lr = LinearRegression()
    model_etc = ElasticNet()
    model_svr = SVR()
    model_gbr = GradientBoostingRegressor()
    model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR']
    model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr]

    cv_score_list = []  # 5-fold CV scores per model
    pre_y_list = []     # test-set predictions per model
    for model in model_dic:
        scores = cross_val_score(model, X_train, y_train, cv=5)
        cv_score_list.append(scores)
        pre_y_list.append(model.fit(X_train, y_train).predict(X_test))

    # Evaluate every model on four regression metrics.
    model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]
    model_metrics_list = []
    for i in range(len(model_dic)):
        tmp_list = [m(y_test, pre_y_list[i]) for m in model_metrics_name]
        model_metrics_list.append(tmp_list)

    df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2'])
    print('=' * 10, f, '=' * 10)
    print(df2)
2. 樣本數少的情況下該如何訓練。一開始初步的想法是將任務轉化爲二分類任務,因此一開始的時候嘗試了深度森林
# --- Deep-forest (gcForest) baseline: frame the task as classification ---
import argparse
import os
import pickle

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json


def parse_args():
    """Parse the command line: model config file and dataset file name."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default='gcforest', help='Train Model File')
    parser.add_argument("--data", type=str, default='mooc_data.txt', help='Dataset')
    args = parser.parse_args()
    return args


def get_toy_config():
    """Build a minimal cascade configuration for GCForest.

    Only a LogisticRegression stage is enabled; XGB / RandomForest /
    ExtraTrees stages were tried during tuning and are left out here.
    """
    config = {}
    ca_config = {}
    ca_config["random_state"] = 0
    ca_config["max_layers"] = 100
    ca_config["early_stopping_rounds"] = 3
    # NOTE(review): n_classes is 10 although the surrounding text describes a
    # binary task -- confirm against the label encoding.
    ca_config["n_classes"] = 10
    ca_config["estimators"] = []
    ca_config["estimators"].append({"n_folds": 5, "type": "LogisticRegression"})
    config["cascade"] = ca_config
    return config


if __name__ == "__main__":
    args = parse_args()
    if args.model == 'gcforest':
        config = get_toy_config()
    else:
        config = load_json(args.model)

    data_path = 'data/' + args.data  # renamed from `dir` to avoid shadowing the builtin
    if not os.path.exists(data_path):
        raise ValueError("The file does not exist!")

    # Parse the CSV by hand: column 0 is an identifier, the rest are numeric.
    dataset = []
    with open(data_path, 'r') as fin:
        for line in fin:
            cols = line.split(',')
            for i in range(1, len(cols)):
                cols[i] = float(cols[i])
            # NOTE(review): cols[1:-1] silently drops the LAST column, so the
            # label extracted below is really the second-to-last field of each
            # line -- confirm this is intentional.
            dataset.append(np.array(cols[1:-1]))
    dataset = np.array(dataset)

    np.random.shuffle(dataset)
    # NOTE(review): only the first 20% is used for TRAINING here, unlike the
    # other scripts which train on 80% -- confirm (small-sample experiment?).
    index = int(dataset.shape[0] * 0.2)
    X_train = dataset[:index, :-1]
    y_train = dataset[:index, -1]
    X_test = dataset[index:, :-1]
    y_test = dataset[index:, -1]

    gc = GCForest(config)
    # If the model costs too much memory, gc.set_keep_model_in_mem(False)
    # forces gcforest not to keep the model in memory (default is True).
    # gcforest expects a 3-D input: (n_samples, n_channels, n_features).
    X_train = X_train[:, np.newaxis, :]
    X_test = X_test[:, np.newaxis, :]

    # BUG FIX: the original left this call commented out, so gc.predict()
    # below ran on an UNFITTED cascade and raised. The model must be fitted
    # first; fit_transform also returns the concatenated predict_proba of the
    # last-layer estimators, and accepts X_test/y_test keywords to log test
    # accuracy during training.
    X_train_enc = gc.fit_transform(X_train, y_train)

    y_pred = gc.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
    # X_train_enc can also be fed to another classifier (e.g. xgboost / RF)
    # stacked on top of gcForest.

    # Persist the trained cascade so it can be reloaded and reused later.
    with open("test_gcforest.pkl", "wb") as f:
        pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
後來認爲該任務如果只簡單劃歸爲二分類未免太過粗暴,可以類比最簡單的房價預測做迴歸模型,採用最簡單粗暴的數據擴增手段,將538個樣本擴增爲一萬多個。
def _pairwise_differences(values):
    """Return [values[i] - values[j] for every index pair i < j], in order.

    Turns N samples into N*(N-1)/2 difference pairs, which is how the 538
    raw samples are augmented to over ten thousand training examples.
    """
    pairs = []
    for i in range(len(values)):
        for j in range(i + 1, len(values)):
            pairs.append(values[i] - values[j])
    return pairs


# Augment features and labels with the SAME pairing order so x/y stay aligned.
x_raw = load_student_data()
y_raw = load_label()
y_pair = _pairwise_differences(y_raw)
print("y_pair build finish shape: %s" % len(y_pair))
x_pair = _pairwise_differences(x_raw)
print("x_pair build finish shape: %s" % len(x_pair))
x = np.array(x_pair)
y = np.array(y_pair)
3.深度學習建模 拿到處理後的數據,如何設計出最適合模型的超參數,如網絡層數、神經結點個數、迭代次數等
def deep():
    """Train a small fully-connected regression network and report MSE/MAE.

    Reads x_train / y_train / x_test / y_test from the enclosing scope;
    prints the evaluation scores and the (truncated, absolute) predictions.
    """
    # Network definition.
    model = Sequential()
    # NOTE(review): units=1 makes the FIRST layer a single-neuron bottleneck
    # in front of the wider 256/128/... layers -- confirm this is intentional
    # and not a typo for a wider input layer.
    model.add(Dense(input_dim=x_train.shape[1], units=1, kernel_initializer='uniform'))
    model.add(Activation('relu'))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dense(32))
    model.add(Activation('relu'))
    model.add(Dense(1))  # linear output head for regression
    model.compile(optimizer='adam', metrics=["mae"], loss='mse')

    model.fit(x_train, y_train, batch_size=16, epochs=50)
    score = model.evaluate(x_test, y_test, batch_size=16)
    print('mse score:', score[0])
    print('mae score:', score[1])

    # Print each prediction truncated to an integer and made non-negative.
    y_pred = model.predict(x_test)
    print("raw y_pred")
    for mindex in y_pred:
        mindex[0] = int(mindex[0])
        if mindex[0] < 0:
            mindex[0] = 0 - mindex[0]
        print(mindex[0])


deep()
總結
拿到數據後最先做的是數據分析
實際應用中模型其實往往並不複雜,重要的是對數據的處理和特徵工程
要進行多方面比較,多調整模型,多調整參數,採用多個指標,總結歸納出最合適的方法