Shandong University Practical Training Report

Blog Posts

1. https://blog.csdn.net/qq_41032884/article/details/106619158

2. https://blog.csdn.net/qq_41032884/article/details/106638006

3. https://blog.csdn.net/qq_41032884/article/details/106676616

4. https://blog.csdn.net/qq_41032884/article/details/106688866

5. https://blog.csdn.net/qq_41032884/article/details/106708659

6. https://blog.csdn.net/qq_41032884/article/details/106837553

7. https://blog.csdn.net/qq_41032884/article/details/106865137

8. https://blog.csdn.net/qq_41032884/article/details/106959074

9. https://blog.csdn.net/qq_41032884/article/details/106986390

10. https://blog.csdn.net/qq_41032884/article/details/107008654

11. https://blog.csdn.net/qq_41032884/article/details/107025727

 

Main Work

My work mainly involved data processing and analysis, and algorithm modeling. The main tasks completed were:

Processing MOOC learning records and modeling with several algorithms (gcforest, SVR, linear regression, and others), comparing them across multiple metrics

Implementing feature engineering, trying a variety of feature-selection methods to improve the training metrics

Preprocessing and analyzing students' other campus-record data

Building and training deep-learning models, tuning hyperparameters, analyzing the results, and organizing them for my teammates

Challenges

The main challenges fell into several areas:

1. Feature engineering: how to select features and optimize the models. I tried several approaches and compared them across multiple metrics; the three scripts below show the variants.

'''
Feature selection with the chi-squared test
'''
from __future__ import division
import time
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet  # regression algorithms to compare
from sklearn.model_selection import cross_val_score  # cross-validation
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score  # evaluation metrics

from sklearn.ensemble import GradientBoostingRegressor  # ensemble algorithm
from sklearn.feature_selection import VarianceThreshold,SelectKBest,chi2,RFE,SelectFromModel
import os
file=[]

# collect every file under the data directory
for root,dirs,files in os.walk('data'):
    for name in files:
        file.append(os.path.join(root, name))
for f in file:
    if f=='data/.DS_Store':  # skip macOS metadata
        continue

    data=pd.read_csv(f,header=None,index_col=0)
    data_=data.iloc[:,3:]  # drop the first three feature columns
    data_=data_.sample(frac=1)  # shuffle the rows
    dataset=np.array(data_)
    index=int(dataset.shape[0]*0.8)  # 80/20 train/test split
    data_x=dataset[:,:-1]
    data_y=dataset[:,-1]

    X_train=data_x[:index,:]
    y_train=data_y[:index]
    X_test=data_x[index:,:]
    y_test=data_y[index:]
    # chi2 assumes non-negative feature values and a discrete target
    model=SelectKBest(chi2, k=2)  # keep the 2 highest-scoring features
    X_train=model.fit_transform(X_train, y_train)
    X_test=model.transform(X_test)

    # regression models to compare
    model_br = BayesianRidge()  # Bayesian ridge regression
    model_lr = LinearRegression()  # ordinary linear regression
    model_etc = ElasticNet()  # elastic-net regression
    model_svr = SVR()  # support vector regression
    model_gbr = GradientBoostingRegressor()  # gradient boosting regression
    model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR']  # model names
    model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr]  # model objects
    cv_score_list = []  # cross-validation scores per model
    pre_y_list = []  # test-set predictions per model


    for model in model_dic:  # iterate over the models
        scores = cross_val_score(model, X_train, y_train, cv=5)  # 5-fold cross-validation
        cv_score_list.append(scores)  # store the CV scores
        pre_y_list.append(model.fit(X_train, y_train).predict(X_test))  # fit, then predict on the test set

    model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]  # evaluation metrics
    model_metrics_list = []  # metric results, one row per model
    for i in range(5):  # loop over model indices
        tmp_list = []
        for m in model_metrics_name:  # compute each metric against the test labels
            tmp_score = m(y_test, pre_y_list[i])
            tmp_list.append(tmp_score)
        model_metrics_list.append(tmp_list)

    df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2'])  # metric table

    print('='*10, f, '='*10)
    print(df2)
    
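One caveat with this approach: chi2 rejects negative feature values and expects a discrete target, so depending on the raw columns the call above can raise an error. A minimal guard, assuming feature matrices X_train/X_test as above may contain negative values, is to rescale each column into [0, 1] first:

from sklearn.preprocessing import MinMaxScaler

# chi2 scores are only defined for non-negative features,
# so rescale every column into [0, 1] before selecting
scaler = MinMaxScaler()
X_train_nn = scaler.fit_transform(X_train)
X_test_nn = scaler.transform(X_test)

selector = SelectKBest(chi2, k=2)
X_train_sel = selector.fit_transform(X_train_nn, y_train)
X_test_sel = selector.transform(X_test_nn)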
'''
Recursive feature elimination (RFE)
'''
from __future__ import division
import time
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet  # regression algorithms to compare
from sklearn.model_selection import cross_val_score  # cross-validation
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score  # evaluation metrics

from sklearn.ensemble import GradientBoostingRegressor  # ensemble algorithm
from sklearn.feature_selection import VarianceThreshold,SelectKBest,chi2,RFE,SelectFromModel
import os
file=[]

# collect every file under the data directory
for root,dirs,files in os.walk('data'):
    for name in files:
        file.append(os.path.join(root, name))
for f in file:
    if f=='data/.DS_Store':  # skip macOS metadata
        continue

    data=pd.read_csv(f,header=None,index_col=0)
    data_=data.iloc[:,3:]  # drop the first three feature columns
    data_=data_.sample(frac=1)  # shuffle the rows
    dataset=np.array(data_)
    index=int(dataset.shape[0]*0.8)  # 80/20 train/test split
    data_x=dataset[:,:-1]
    data_y=dataset[:,-1]

    X_train=data_x[:index,:]
    y_train=data_y[:index]
    X_test=data_x[index:,:]
    y_test=data_y[index:]

    # regression models to compare
    model_br = BayesianRidge()  # Bayesian ridge regression
    model_lr = LinearRegression()  # ordinary linear regression
    model_etc = ElasticNet()  # elastic-net regression
    model_svr = SVR()  # support vector regression
    model_gbr = GradientBoostingRegressor()  # gradient boosting regression
    model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR']  # model names
    model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr]  # model objects
    cv_score_list = []  # cross-validation scores per model
    pre_y_list = []  # test-set predictions per model


    for model in model_dic:  # iterate over the models
        scores = cross_val_score(model, X_train, y_train, cv=5)  # 5-fold cross-validation on the full feature set
        cv_score_list.append(scores)  # store the CV scores
        # RFE needs an estimator exposing coef_ or feature_importances_;
        # SVR with the default rbf kernel has neither, so rank with LinearRegression for it
        ranker = LinearRegression() if isinstance(model, SVR) else model
        rfe = RFE(estimator=ranker, n_features_to_select=2)
        # select into copies so every model starts from the full feature set
        X_train_sel = rfe.fit_transform(X_train, y_train)
        X_test_sel = rfe.transform(X_test)
        pre_y_list.append(model.fit(X_train_sel, y_train).predict(X_test_sel))  # fit, then predict on the test set

    model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]  # evaluation metrics
    model_metrics_list = []  # metric results, one row per model
    for i in range(5):  # loop over model indices
        tmp_list = []
        for m in model_metrics_name:  # compute each metric against the test labels
            tmp_score = m(y_test, pre_y_list[i])
            tmp_list.append(tmp_score)
        model_metrics_list.append(tmp_list)

    df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2'])  # metric table

    print('='*10, f, '='*10)
    print(df2)
    
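For reference, a fitted RFE object records which of the original columns survived; a short sketch using the rfe from the last loop iteration above:

print("kept columns:", np.where(rfe.support_)[0])  # indices of the selected features
print("feature ranking:", rfe.ranking_)            # rank 1 = selected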

 

'''
Feature selection via a logistic-regression penalty
'''
from __future__ import division
import time
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet  # regression algorithms to compare
from sklearn.model_selection import cross_val_score  # cross-validation
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score  # evaluation metrics

from sklearn.ensemble import GradientBoostingRegressor  # ensemble algorithm
from sklearn.feature_selection import VarianceThreshold,SelectKBest,chi2,RFE,SelectFromModel
from sklearn.linear_model import LogisticRegression
import os
file=[]

# collect every file under the data directory
for root,dirs,files in os.walk('data'):
    for name in files:
        file.append(os.path.join(root, name))
for f in file:
    if f=='data/.DS_Store':  # skip macOS metadata
        continue

    data=pd.read_csv(f,header=None,index_col=0)
    data_=data.iloc[:,3:]  # drop the first three feature columns
    data_=data_.sample(frac=1)  # shuffle the rows
    dataset=np.array(data_)
    index=int(dataset.shape[0]*0.8)  # 80/20 train/test split
    data_x=dataset[:,:-1]
    data_y=dataset[:,-1]

    X_train=data_x[:index,:]
    y_train=data_y[:index]
    X_test=data_x[index:,:]
    y_test=data_y[index:]

    # keep the features whose penalized-model coefficients exceed the default threshold;
    # note: LogisticRegression is a classifier, so this assumes a discrete target (e.g. integer grades)
    model=SelectFromModel(LogisticRegression(penalty='l2',C=0.1))
    X_train=model.fit_transform(X_train,y_train)
    X_test=model.transform(X_test)

    # regression models to compare
    model_br = BayesianRidge()  # Bayesian ridge regression
    model_lr = LinearRegression()  # ordinary linear regression
    model_etc = ElasticNet()  # elastic-net regression
    model_svr = SVR()  # support vector regression
    model_gbr = GradientBoostingRegressor()  # gradient boosting regression
    model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR']  # model names
    model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr]  # model objects
    cv_score_list = []  # cross-validation scores per model
    pre_y_list = []  # test-set predictions per model


    for model in model_dic:  # iterate over the models
        scores = cross_val_score(model, X_train, y_train, cv=5)  # 5-fold cross-validation
        cv_score_list.append(scores)  # store the CV scores
        pre_y_list.append(model.fit(X_train, y_train).predict(X_test))  # fit, then predict on the test set

    model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]  # evaluation metrics
    model_metrics_list = []  # metric results, one row per model
    for i in range(5):  # loop over model indices
        tmp_list = []
        for m in model_metrics_name:  # compute each metric against the test labels
            tmp_score = m(y_test, pre_y_list[i])
            tmp_list.append(tmp_score)
        model_metrics_list.append(tmp_list)

    df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2'])  # metric table

    print('='*10, f, '='*10)
    print(df2)
    
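One note on the penalty choice: with L2 the coefficients are only shrunk, so SelectFromModel merely thresholds their magnitudes; an L1 penalty zeroes uninformative coefficients outright and is the more common choice for selection. A variant sketch (the liblinear solver supports L1):

model_l1 = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', C=0.1))
X_train_l1 = model_l1.fit_transform(X_train, y_train)
X_test_l1 = model_l1.transform(X_test)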

2. How to train when there are few samples. My initial idea was to turn the task into a binary classification problem, so I first tried a deep forest (gcforest):

import argparse
import numpy as np
import os
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json

def parse_args():
    parser=argparse.ArgumentParser()
    parser.add_argument("--model",type=str,default='gcforest',help='Train Model File')
    parser.add_argument("--data",type=str,default='mooc_data.txt',help='Dataset')
    args=parser.parse_args()
    return args
def get_toy_config():
    config = {}
    ca_config = {}
    ca_config["random_state"] = 0
    ca_config["max_layers"] = 100  # upper bound on the cascade depth
    ca_config["early_stopping_rounds"] = 3
    ca_config["n_classes"] = 10  # must match the label set (2 for the binary formulation above)
    ca_config["estimators"] = []
    '''
    ca_config["estimators"].append(
            {"n_folds": 5, "type": "XGBClassifier", "n_estimators": 10, "max_depth": 5,
             "objective": "multi:softprob", "silent": True, "nthread": -1, "learning_rate": 0.1} )
    ca_config["estimators"].append({"n_folds": 5, "type": "RandomForestClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
    ca_config["estimators"].append({"n_folds": 5, "type": "ExtraTreesClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
    '''
    ca_config["estimators"].append({"n_folds": 5, "type": "LogisticRegression"})
    config["cascade"] = ca_config
    return config


if __name__ == "__main__":
    args = parse_args()
    if args.model =='gcforest':
        config = get_toy_config()
    else:
        config = load_json(args.model)
    file=args.data
    dir='data/'+file
    if not os.path.exists(dir):
        raise ValueError("The file does not exist!")
    f=open(dir,'r')
    lines=f.readlines()
    dataset=[]
    for line in lines:
        cols=line.split(',')
        for i in range(len(cols)-1):
            cols[i+1]=float(cols[i+1])  # convert every column after the leading id to float
        col_array=np.array(cols[1:-1])  # drop the leading id column and the trailing column; the last remaining column serves as the label below
        dataset.append(col_array)
    dataset=np.array(dataset)

    np.random.shuffle(dataset)
    index=int(dataset.shape[0]*0.2)
    # note: this trains on the first 20% of the rows and tests on the remaining 80%
    X_train=dataset[:index,:-1]
    y_train=dataset[:index,-1]
    X_test=dataset[index:,:-1]
    y_test=dataset[index:,-1]



    gc = GCForest(config)
    # If the model you use cost too much memory for you.
    # You can use these methods to force gcforest not keeping model in memory
    # gc.set_keep_model_in_mem(False), default is TRUE.


    X_train = X_train[:, np.newaxis, :]
    X_test = X_test[:, np.newaxis, :]

    X_train_enc = gc.fit_transform(X_train, y_train)  # fit the cascade before calling predict below
    # X_enc is the concatenated predict_proba result of each estimators of the last layer of the GCForest model
    # X_enc.shape =
    #   (n_datas, n_estimators * n_classes): If cascade is provided
    #   (n_datas, n_estimators * n_classes, dimX, dimY): If only finegrained part is provided
    # You can also pass X_test, y_test to fit_transform method, then the accuracy on test data will be logged when training.
    # X_train_enc, X_test_enc = gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test)
    # WARNING: if you set gc.set_keep_model_in_mem(True), you would have to use
    # gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test) to evaluate your model.

    y_pred = gc.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))

    # You can try passing X_enc to another classifier on top of gcForest, e.g. xgboost/RF.
    '''
    X_test_enc = gc.transform(X_test)
    X_train_enc = X_train_enc.reshape((X_train_enc.shape[0], -1))
    X_test_enc = X_test_enc.reshape((X_test_enc.shape[0], -1))
    X_train_origin = X_train.reshape((X_train.shape[0], -1))
    X_test_origin = X_test.reshape((X_test.shape[0], -1))
    X_train_enc = np.hstack((X_train_origin, X_train_enc))
    X_test_enc = np.hstack((X_test_origin, X_test_enc))
    print("X_train_enc.shape={}, X_test_enc.shape={}".format(X_train_enc.shape, X_test_enc.shape))
    clf = RandomForestClassifier(n_estimators=1000, max_depth=None, n_jobs=-1)
    clf.fit(X_train_enc, y_train)
    y_pred = clf.predict(X_test_enc)
    acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy of Other classifier using gcforest's X_encode = {:.2f} %".format(acc * 100))
    '''


    # dump
    with open("test_gcforest.pkl", "wb") as f:
        pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)

    '''
    with open("test.pkl", "rb") as f:
        gc = pickle.load(f)
    y_pred = gc.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(acc * 100))
    '''
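One detail the script glosses over: the binary formulation needs discrete labels, and n_classes in get_toy_config must match them. If the raw target is a continuous score, a minimal binarization sketch (the threshold of 60 is illustrative, not from the project data):

PASS_THRESHOLD = 60  # illustrative cutoff, not from the project data
y_train = (y_train >= PASS_THRESHOLD).astype(int)
y_test = (y_test >= PASS_THRESHOLD).astype(int)
# remember to set ca_config["n_classes"] = 2 in get_toy_config() to match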

Later I felt that reducing the task to plain binary classification was too crude. By analogy with the simplest house-price prediction task it can be treated as regression, and with a brute-force augmentation trick (taking pairwise differences between samples) the 538 samples expand to well over ten thousand; a sketch for recovering absolute scores from such a model follows the code.

import numpy as np

x_raw = load_student_data()  # project helper: feature matrix, one row per student
y_raw = load_label()         # project helper: target score per student

x_pair = []
y_pair = []

# pairwise differences: every pair (i, j) with i < j,
# turning n samples into n*(n-1)/2 difference samples
for index in range(len(y_raw)):
    for i in range(1, len(y_raw) - index):
        y_pair.append(y_raw[index] - y_raw[index + i])
print("y_pair build finished, size: %s" % len(y_pair))

for index in range(len(x_raw)):
    for i in range(1, len(x_raw) - index):
        x_pair.append(x_raw[index] - x_raw[index + i])
print("x_pair build finished, size: %s" % len(x_pair))
x = np.array(x_pair)
y = np.array(y_pair)

 
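A model trained on pairwise differences predicts score gaps rather than scores, so a decoding step is needed at inference time. A minimal sketch, assuming a fitted regressor model and the x_raw/y_raw arrays above (the helper name predict_score is illustrative): predict the gap between the new sample and every known student, then average the implied scores.

def predict_score(model, x_new, x_raw, y_raw):
    # for each known student i, model.predict(x_new - x_raw[i]) estimates
    # y_new - y_raw[i]; adding y_raw[i] back and averaging gives one estimate of y_new
    diffs = np.asarray(x_new) - np.asarray(x_raw)   # differences against every known sample
    gap_pred = model.predict(diffs)                 # predicted score gaps
    return float(np.mean(gap_pred + np.asarray(y_raw)))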

3. Deep-learning modeling: given the processed data, how to design the hyperparameters that best suit the model, such as the number of layers, the number of units per layer, and the number of epochs. (A small search sketch follows the network code below.)

from keras.models import Sequential
from keras.layers import Dense, Activation

def deep():
    # network definition; assumes x_train, y_train, x_test, y_test are already prepared
    model = Sequential()
    # wide first hidden layer (a width of 1 here would bottleneck the whole network)
    model.add(Dense(input_dim=x_train.shape[1], units=512, kernel_initializer='uniform'))
    model.add(Activation('relu'))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dense(32))
    model.add(Activation('relu'))
    model.add(Dense(1))  # single linear output unit for regression
    model.compile(optimizer='adam', metrics=["mae"], loss='mse')
    model.fit(x_train, y_train, batch_size=16, epochs=50)
    score = model.evaluate(x_test, y_test, batch_size=16)
    print('mse score:', score[0])
    print('mae score:', score[1])

    # print the predictions, rounded to integers and made non-negative
    y_pred = model.predict(x_test)
    print("raw y_pred")
    for mindex in y_pred:
        print(abs(int(mindex[0])))

deep()

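There is no closed-form rule for these choices; in practice I compared configurations empirically. Below is a minimal sketch of such a manual search over depth, width, and epochs. The build_model helper and the candidate grid are illustrative, not taken from the project code, and a held-out validation split would be more rigorous than scoring on the test set:

from keras.models import Sequential
from keras.layers import Dense, Activation

def build_model(input_dim, widths):
    # MLP regressor whose hidden-layer widths are given by `widths`
    model = Sequential()
    model.add(Dense(widths[0], input_dim=input_dim))
    model.add(Activation('relu'))
    for w in widths[1:]:
        model.add(Dense(w))
        model.add(Activation('relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# small manual grid over architecture and epochs; keep the config with the lowest test MAE
candidates = [
    ([512, 256, 128, 64, 32], 50),
    ([256, 128, 64], 50),
    ([128, 64], 100),
]
best = None
for widths, epochs in candidates:
    model = build_model(x_train.shape[1], widths)
    model.fit(x_train, y_train, batch_size=16, epochs=epochs, verbose=0)
    _, mae = model.evaluate(x_test, y_test, batch_size=16, verbose=0)
    print(widths, epochs, 'mae:', mae)
    if best is None or mae < best[0]:
        best = (mae, widths, epochs)
print('best config:', best)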
Summary

The first thing to do after receiving data is data analysis.

In practical applications the models themselves are often not complex; what matters most is the data processing and the feature engineering.

Compare from many angles, keep adjusting the models and their parameters, use multiple metrics, and distill the most suitable approach from the results.
