山東大學實訓 Day8

總算是面試完了,寫這篇博客時結果還沒出來...

拿到了比較完整的數據集,其中包含學生的圖書館門禁、借書和消費記錄,我們要用這些數據來預測學生的成績。

做了一下數據的處理:


import numpy as np
import sys
from sklearn import preprocessing



def split_period(file_name):
    """Load a numeric text table and split its rows by term.

    The file is parsed with ``np.genfromtxt`` and the first parsed row is
    discarded (presumably the header line). Column 0 of each remaining row
    is the term tag; rows tagged 1, 2 or 3 go to the matching group with
    the tag column stripped. Rows with any other tag are ignored.

    Returns:
        A tuple ``(term_1, term_2, term_3)`` of NumPy arrays.
    """
    table = np.genfromtxt(file_name)[1:]

    def rows_for(term):
        # Keep every column except the leading term tag.
        return np.array([row[1:] for row in table if row[0] == term])

    return rows_for(1), rows_for(2), rows_for(3)


def split_term_consumption(file_name):
    """Load a consumption text table as strings and split it by term.

    The file is parsed with ``np.genfromtxt(..., dtype=str)`` and the first
    parsed row is discarded (presumably the header line). Column 0 is the
    term tag ('1', '2' or '3'); rows with any other tag are skipped.

    Returns:
        A tuple of three plain lists, one per term. Each entry is
        ``[int, str, str, int, float]`` built from columns 1-5.
    """
    rows = np.genfromtxt(file_name, dtype=str)[1:]
    term_one, term_two, term_three = [], [], []
    for row in rows:
        tag = row[0]
        if tag == '1':
            bucket = term_one
        elif tag == '2':
            bucket = term_two
        elif tag == '3':
            bucket = term_three
        else:
            continue
        # student id, location, date, time, amount
        bucket.append(
            [int(row[1]), str(row[2]), str(row[3]), int(row[4]), float(row[5])]
        )
    return term_one, term_two, term_three


def split_term_consumption_npy(file_name):
    """Load a consumption table from a ``.npy`` file and split it by term.

    The first row of the loaded array is discarded (presumably the header).
    Column 0 is compared against the strings '1', '2' and '3'; rows that
    match none of them are skipped.

    NOTE(review): if the saved array holds byte strings, these comparisons
    never match under Python 3 -- confirm the dtype of the file.

    Returns:
        A tuple of three plain lists (not arrays), one per term. Each entry
        is ``[int, str, str, int, float]`` built from columns 1-5.
    """
    records = np.load(file_name)[1:]
    by_term = ([], [], [])
    for rec in records:
        term = rec[0]
        for tag, bucket in zip(('1', '2', '3'), by_term):
            if term == tag:
                # student id, location, date, time, amount
                bucket.append(
                    [int(rec[1]), str(rec[2]), str(rec[3]), int(rec[4]), float(rec[5])]
                )
                break
    return by_term

def counter(larray, num_students=538):
    """Count how often each student ID occurs in ``larray``.

    Args:
        larray: Array-like of student IDs. It is converted with
            ``np.asarray`` so plain lists are compared element-wise too.
        num_students: Highest student ID to report. IDs ``1..num_students``
            each get one output row; values outside that range are ignored.
            Defaults to 538, the count previously hard-coded here.

    Returns:
        A NumPy array with one ``(student_id, occurrences)`` row per ID,
        ordered by ID.
    """
    ids = np.asarray(larray)
    return np.array(
        [(sid, np.count_nonzero(ids == sid)) for sid in np.arange(1, num_students + 1)]
    )


def counter_health_eating_consumption(consumption, num_students=538):
    """Count, per student, purchases at one location inside a fixed time window.

    A record is counted when its location field equals the literal below and
    its time field lies strictly between 63000 and 83000.

    The previous condition, ``t > 63000 < 83000``, is a chained comparison
    that Python evaluates as ``t > 63000 and 63000 < 83000``, so the upper
    bound was never applied. It is now ``63000 < t < 83000``.

    Args:
        consumption: Iterable of records indexed as
            ``[student_id, location, date, time, amount]``.
        num_students: Number of students; IDs ``1..num_students`` each get
            one output row. Defaults to 538, the count previously hard-coded.

    Returns:
        A NumPy array with one ``[student_id, count]`` row per ID.

    Raises:
        IndexError: If a counted record has a student ID above
            ``num_students``.
    """
    money_counters = [[sid, 0] for sid in np.arange(1, num_students + 1)]
    for record in consumption:
        # NOTE(review): these look like the GBK bytes of the canteen name as
        # read under Python 2 -- confirm they still match on your interpreter.
        if record[1] == '\xca\xb3\xcc\xc3':
            # Regular-meal window; presumably HHMMSS, i.e. 06:30-08:30 -- confirm.
            if 63000 < record[3] < 83000:
                # Count on-time meals; add record[4] instead to sum the
                # amount spent on them.
                money_counters[int(record[0]) - 1][1] += 1
    return np.array(money_counters)


def counter__consumption(consumption, num_students=538):
    """Sum the amount field of all consumption records per student.

    Args:
        consumption: Iterable of records indexed as
            ``[student_id, location, date, time, amount]``; only fields 0
            and 4 are read.
        num_students: Number of students; IDs ``1..num_students`` each get
            one output row. Defaults to 538, the count previously hard-coded.

    Returns:
        A NumPy array with one ``[student_id, total_amount]`` row per ID.

    Raises:
        IndexError: If a record has a student ID above ``num_students``.
    """
    money_counters = [[sid, 0] for sid in np.arange(1, num_students + 1)]
    for record in consumption:
        # Student IDs are 1-based, list positions are 0-based.
        money_counters[int(record[0]) - 1][1] += record[4]
    return np.array(money_counters)


def catalog2list(file_name='./data/catalog.npy'):
    """Load the book catalog and return its first two columns as text pairs.

    The interpreter check uses ``sys.version_info``. The previous test,
    ``sys.version > '3'``, compared version strings lexicographically, which
    gives the wrong answer for a major version such as "10".

    Args:
        file_name: Path of the ``.npy`` catalog file. Defaults to the
            location previously hard-coded here.

    Returns:
        On Python 3, a list of ``[column_0, column_1]`` pairs in which byte
        strings are decoded with the default codec and other values are
        passed through ``str``. On Python 2, the loaded array unchanged.
    """
    book_catalog = np.load(file_name)
    if sys.version_info[0] < 3:
        return book_catalog

    def _text(value):
        # Only byte strings have to be decoded; text entries are kept as text.
        return value.decode() if isinstance(value, bytes) else str(value)

    return [[_text(row[0]), _text(row[1])] for row in book_catalog]


def counter_book(book_list):
    """Tally how many entries of ``book_list`` share each value at index 1.

    Returns:
        A dict mapping every distinct ``entry[1]`` to its occurrence count.
    """
    tally = {}
    for entry in book_list:
        category = entry[1]
        tally[category] = tally.get(category, 0) + 1
    return tally


# Split every raw table into its three terms.
score_1st, score_2nd, score_3rd = split_period('./data/score.txt')
access_1st, access_2nd, access_3rd = split_period('./data/access.txt')

consumption_1st, consumption_2nd, consumption_3rd = split_term_consumption_npy('./data/consumption.npy')
borrow_1st, borrow_2nd, borrow_3rd = split_period('./data/borrow.txt')

# Book catalog: a list of pairs plus a dict mapping the first field to the second.
lib_list = catalog2list()
lib_dict = dict(lib_list)
# print(lib_list)
# print("book status")
# # number of books per category
# book_dict = counter_book(lib_list)
# book_label = book_dict.keys()
# print(book_dict)
# print("book_label")
# # category labels
# print(book_label)
print("")
# Term 3: how often each student borrowed a book.
borrow_3rd_time = counter(np.sort(borrow_3rd[:, 0]))
# Term 3: how often each student entered the library.
access_3rd_times = counter(np.sort(access_3rd[:, 0]))
# Term 3: overall ranking records, ordered by column 0.
score_3rd_times = score_3rd[score_3rd[:, 0].argsort()]
# Term 3: healthy-eating (regular meal) counts per student.
health_eat_consumption_3rd_times = counter_health_eating_consumption(consumption_3rd)
# Term 3: total amount spent per student.
consumption_3rd_times = counter__consumption(consumption_3rd)
# Term 1: overall ranking records, ordered by column 0.
score_1st_times = score_1st[score_1st[:, 0].argsort()]
# Term 2: overall ranking records, ordered by column 0.
# The ordering must come from score_2nd itself; it used to reuse score_1st's
# argsort, which paired term-2 rows with the wrong students.
score_2nd_times = score_2nd[score_2nd[:, 0].argsort()]





def load_student_data():
    """Assemble the standardized per-student feature matrix.

    For each of the first 538 rows, column 1 is taken from six module-level
    tables, in this order: term-3 borrow counts, term-3 library-access
    counts, term-3 total consumption, term-3 healthy-eating counts, term-1
    scores and term-2 scores.

    Returns:
        The result of ``preprocessing.scale`` applied to the 538 x 6 array.
    """
    feature_tables = (
        borrow_3rd_time,
        access_3rd_times,
        consumption_3rd_times,
        health_eat_consumption_3rd_times,
        score_1st_times,
        score_2nd_times,
    )
    rows = [[table[i][1] for table in feature_tables] for i in np.arange(538)]
    return preprocessing.scale(np.array(rows))


def load_label():
    """Return column 1 of the module-level ``score_3rd_times`` table as labels."""
    labels = score_3rd_times[:, 1]
    return labels


 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章