山東大學實訓 Day2

上午對慕課學習記錄做了預處理

原來的數據樣例:

處理代碼:

'''
Preprocess the MOOC course-attendance record data.
'''
import pandas as pd
import numpy as np
import json
def build_id_maps(values):
    """Assign consecutive integer ids to the distinct entries of *values*.

    Ids are handed out in first-appearance order, so the same input always
    yields the same mapping (iterating a ``set`` of strings does not
    guarantee a stable order between interpreter runs).

    Args:
        values: Iterable of hashable category values.

    Returns:
        A ``(value_to_id, id_to_value)`` tuple of dicts that are inverse
        to each other.
    """
    distinct = list(dict.fromkeys(values))
    value_to_id = {value: idx for idx, value in enumerate(distinct)}
    id_to_value = dict(enumerate(distinct))
    return value_to_id, id_to_value


if __name__ == '__main__':
    # Encode the categorical columns of mooc_data.csv as integer ids, save
    # the id dictionaries to dictionary.json, and write the encoded table
    # to mooc_data.txt.
    mooc = pd.read_csv('mooc_data.csv', header=0, index_col=None)

    # .tolist() yields plain Python scalars, which json can serialize.
    course_id_dict, id_course_dict = build_id_maps(mooc['課程名稱'].tolist())
    grade_id_dict, id_grade_dict = build_id_maps(mooc['年級'].tolist())
    group_id_dict, id_group_dict = build_id_maps(mooc['分組'].tolist())

    file_dict = {
        'course_id_dict': course_id_dict,
        'grade_id_dict': grade_id_dict,
        'group_id_dict': group_id_dict,
        'id_course_dict': id_course_dict,
        'id_grade_dict': id_grade_dict,
        'id_group_dict': id_group_dict,
    }
    # ensure_ascii=False writes the Chinese labels verbatim, so pin UTF-8
    # instead of relying on the platform's default encoding.
    # Note: JSON object keys are always strings, so the integer keys of the
    # id_* dictionaries are stored as "0", "1", ...
    with open('dictionary.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(file_dict, ensure_ascii=False))

    # Replace every label by its id. Columns are addressed by name so the
    # step does not depend on the column order of the CSV.
    mooc['分組'] = mooc['分組'].map(group_id_dict)
    mooc['課程名稱'] = mooc['課程名稱'].map(course_id_dict)
    mooc['年級'] = mooc['年級'].map(grade_id_dict)

    # These two columns are dropped from the output table.
    del mooc['課程編號']
    del mooc['班級']

    # header=0 / index=0 are falsy: no header row and no index column.
    mooc.to_csv('mooc_data.txt', header=0, index=0)


處理結果:

dictionary.json

mooc_data.txt

下載 gcForest 的 GitHub 代碼,放到自己的工作文件夾下

訓練:

import  argparse
import numpy as np
import os
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json
def parse_args():
    """Parse the command-line options of the training script.

    Returns:
        The argparse namespace with two string attributes:
        ``model`` (default ``'gcforest'``) and ``data``
        (default ``'mooc_data.txt'``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model",
        type=str,
        default='gcforest',
        help='Train Model File',
    )
    cli.add_argument(
        "--data",
        type=str,
        default='mooc_data.txt',
        help='Dataset',
    )
    return cli.parse_args()
def get_toy_config():
    """Build the default cascade configuration.

    The cascade is capped at 100 layers with an early-stopping window of
    3 rounds, and every layer holds four 5-fold estimators: XGBoost,
    random forest, extra trees and logistic regression.

    Returns:
        A freshly built dict of the form ``{"cascade": {...}}``.
    """
    n_folds = 5
    xgb = {
        "n_folds": n_folds,
        "type": "XGBClassifier",
        "n_estimators": 10,
        "max_depth": 5,
        "objective": "multi:softprob",
        "silent": True,
        "nthread": -1,
        "learning_rate": 0.1,
    }
    # The two tree ensembles share every setting except their type.
    forests = [
        {
            "n_folds": n_folds,
            "type": forest_type,
            "n_estimators": 10,
            "max_depth": None,
            "n_jobs": -1,
        }
        for forest_type in ("RandomForestClassifier", "ExtraTreesClassifier")
    ]
    logistic = {"n_folds": n_folds, "type": "LogisticRegression"}

    cascade = {
        "random_state": 0,
        "max_layers": 100,
        "early_stopping_rounds": 3,
        # NOTE(review): n_classes is fixed at 10 - confirm it matches the
        # number of distinct labels in the dataset being trained on.
        "n_classes": 10,
        "estimators": [xgb, *forests, logistic],
    }
    return {"cascade": cascade}


if __name__ == "__main__":
    # Train a gcForest cascade on a numeric table stored under dataset/,
    # print the accuracy on the held-out rows and pickle the fitted model.
    args = parse_args()

    # The literal 'gcforest' selects the built-in configuration; any other
    # value is passed to load_json as the path of a JSON config file.
    if args.model == 'gcforest':
        config = get_toy_config()
    else:
        config = load_json(args.model)

    data_path = 'dataset/' + args.data
    if not os.path.exists(data_path):
        raise ValueError("The file does not exist!")

    try:
        dataset = np.loadtxt(data_path)
    except ValueError:
        # A file written by DataFrame.to_csv is comma-separated, which the
        # default whitespace delimiter cannot parse; retry with ','.
        dataset = np.loadtxt(data_path, delimiter=',')

    # Shuffles the rows in place; no seed is set, so the split differs
    # from run to run.
    np.random.shuffle(dataset)

    # NOTE(review): the first 20% of the rows is used for training and the
    # remaining 80% for testing - confirm this is not meant to be the
    # other way round.
    split_at = int(dataset.shape[0] * 0.2)
    # Column 0 is excluded from the features and the last column is the
    # label. NOTE(review): presumably column 0 is a record/student id -
    # confirm against the data file.
    X_train = dataset[:split_at, 1:-1]
    y_train = dataset[:split_at, -1]
    X_test = dataset[split_at:, 1:-1]
    y_test = dataset[split_at:, -1]

    gc = GCForest(config)
    # If the model costs too much memory, you can force gcforest not to
    # keep the model in memory:
    # gc.set_keep_model_in_mem(False), default is True.

    # Insert a length-1 axis: (n_rows, n_features) -> (n_rows, 1, n_features).
    X_train = X_train[:, np.newaxis, :]
    X_test = X_test[:, np.newaxis, :]

    X_train_enc = gc.fit_transform(X_train, y_train)
    # X_enc is the concatenated predict_proba result of each estimator of
    # the last layer of the GCForest model.
    # X_enc.shape =
    #   (n_datas, n_estimators * n_classes): if cascade is provided
    #   (n_datas, n_estimators * n_classes, dimX, dimY): if only the
    #       fine-grained part is provided
    # You can also pass X_test, y_test to fit_transform; the accuracy on the
    # test data will then be logged while training:
    # X_train_enc, X_test_enc = gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test)
    # WARNING: if you set gc.set_keep_model_in_mem(True), you would have to use
    # gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test) to evaluate your model.
    # NOTE(review): presumably False is meant in the warning above, since a
    # model that is not kept in memory cannot serve a later predict() call -
    # confirm against the gcForest documentation.

    y_pred = gc.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))

    # You can try passing X_enc to another classifier on top of gcForest,
    # e.g. xgboost/RF:
    #
    # X_test_enc = gc.transform(X_test)
    # X_train_enc = X_train_enc.reshape((X_train_enc.shape[0], -1))
    # X_test_enc = X_test_enc.reshape((X_test_enc.shape[0], -1))
    # X_train_origin = X_train.reshape((X_train.shape[0], -1))
    # X_test_origin = X_test.reshape((X_test.shape[0], -1))
    # X_train_enc = np.hstack((X_train_origin, X_train_enc))
    # X_test_enc = np.hstack((X_test_origin, X_test_enc))
    # print("X_train_enc.shape={}, X_test_enc.shape={}".format(X_train_enc.shape, X_test_enc.shape))
    # clf = RandomForestClassifier(n_estimators=1000, max_depth=None, n_jobs=-1)
    # clf.fit(X_train_enc, y_train)
    # y_pred = clf.predict(X_test_enc)
    # acc = accuracy_score(y_test, y_pred)
    # print("Test Accuracy of Other classifier using gcforest's X_encode = {:.2f} %".format(acc * 100))

    # Dump the fitted model.
    with open("test_gcforest.pkl", "wb") as f:
        pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)

    # Load it back (only unpickle files you created yourself):
    #
    # with open("test_gcforest.pkl", "rb") as f:
    #     gc = pickle.load(f)
    # y_pred = gc.predict(X_test)
    # acc = accuracy_score(y_test, y_pred)
    # print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(acc * 100))

還沒寫完,明早繼續 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章