總算是面試完了,寫這篇博客還沒出結果...
拿到了比較全的數據集,該數據集裏包含了學生的圖書館門禁、借書和消費記錄,以此來預測成績。
做了一下數據的處理:
import numpy as np
import sys
from sklearn import preprocessing
def split_period(file_name):
    """Load a numeric table with ``np.genfromtxt`` and split its rows by term.

    The first parsed row is discarded (presumably a header line -- confirm
    against the data files).  Column 0 of each remaining row is compared
    against the term numbers 1, 2 and 3; the row, minus that first column,
    goes to the matching group.  Rows with any other value in column 0 are
    dropped.

    Args:
        file_name: Path (or anything ``np.genfromtxt`` accepts) of the table.

    Returns:
        A tuple ``(first_period, second_period, third_period)`` of 2-D
        arrays.  A term with no rows yields an array of shape ``(0, k)``
        rather than ``(0,)``, so column indexing such as ``result[:, 0]``
        keeps working for empty terms.
    """
    raw = np.genfromtxt(file_name)[1:]
    term = raw[:, 0]
    payload = raw[:, 1:]
    # Boolean masks keep the original row order inside each term.
    return payload[term == 1], payload[term == 2], payload[term == 3]
def split_term_consumption(file_name):
    """Read a text table of consumption records and group them by term.

    The file is parsed as strings with ``np.genfromtxt``; the first parsed
    row is skipped.  Field 0 of each row selects the term ('1', '2' or '3');
    rows with any other value are ignored.

    Returns:
        Three Python lists (terms 1, 2, 3).  Each entry is
        ``[student id (int), location (str), date (str), time (int),
        amount (float)]`` built from fields 1-5 of the row.
    """
    rows = np.genfromtxt(file_name, dtype=str)[1:]
    term_one, term_two, term_three = [], [], []
    by_term = {'1': term_one, '2': term_two, '3': term_three}
    for row in rows:
        bucket = by_term.get(row[0])
        if bucket is None:
            continue
        # student id, location, date, time, amount
        bucket.append([int(row[1]), str(row[2]), str(row[3]), int(row[4]), float(row[5])])
    return term_one, term_two, term_three
def split_term_consumption_npy(file_name):
    """Load consumption records from a ``.npy`` file and group them by term.

    The array is read with ``np.load`` and its first row is skipped.  Field 0
    of each row is compared against the strings '1', '2' and '3'; rows with
    any other value are ignored.

    NOTE(review): the comparison is against ``str`` values, so this assumes
    the stored array holds text rather than bytes -- confirm for the real
    data file.

    Returns:
        Three plain Python lists (terms 1, 2, 3), not arrays.  Each entry is
        ``[student id (int), location (str), date (str), time (int),
        amount (float)]`` built from fields 1-5 of the row.
    """
    records = np.load(file_name)[1:]
    term_one, term_two, term_three = [], [], []
    by_term = {'1': term_one, '2': term_two, '3': term_three}
    for rec in records:
        target = by_term.get(rec[0])
        if target is None:
            continue
        # student id, location, date, time, amount
        target.append([int(rec[1]), str(rec[2]), str(rec[3]), int(rec[4]), float(rec[5])])
    return term_one, term_two, term_three
def counter(larray, num_students=538):
    """Count how often each id ``1..num_students`` occurs in ``larray``.

    Args:
        larray: Array-like of ids.  It is converted with ``np.asarray`` so
            plain sequences work as well as arrays.
        num_students: Highest id to report.  Defaults to 538, the value
            that used to be hard-coded.

    Returns:
        An array with one ``[id, occurrences]`` row per id, in ascending id
        order.  Ids that never occur get a count of 0.
    """
    values = np.asarray(larray)
    ids = np.arange(1, num_students + 1, 1)
    return np.array([[sid, np.count_nonzero(values == sid)] for sid in ids])
def counter_health_eating_consumption(consumption, num_students=538):
    """Count, per student, the matching-location purchases in a time window.

    A record is counted when its location (field 1) equals the marker
    string below and its time (field 3) lies strictly between 63000 and
    83000.  The time is presumably an HHMMSS-style integer, i.e. 06:30:00
    to 08:30:00 -- confirm against the data.

    The previous condition ``t > 63000 < 83000`` was a chained comparison
    equal to ``t > 63000 and 63000 < 83000``, so the upper bound was never
    applied; it is now ``63000 < t < 83000``.

    Args:
        consumption: Iterable of records
            ``[student id, location, date, time, amount]``.
        num_students: Number of student ids (1-based) to report.  Defaults
            to 538, the value that used to be hard-coded.

    Returns:
        An array with one ``[student id, count]`` row per id, in ascending
        id order.
    """
    # NOTE(review): these four characters look like the GBK bytes of the
    # canteen label read as a Python 2 byte string -- confirm it still
    # matches the location field under Python 3.
    location_marker = '\xca\xb3\xcc\xc3'
    counts = [[sid, 0] for sid in np.arange(1, num_students + 1, 1)]
    for record in consumption:
        if record[1] == location_marker and 63000 < record[3] < 83000:
            # Frequency only; the amount (field 4) is deliberately not summed.
            counts[int(record[0]) - 1][1] += 1
    return np.array(counts)
def counter__consumption(consumption):
    """Sum the amount (field 4) of every record per student id (field 0).

    Returns:
        An array with one ``[student id, total]`` row for each id from 1 to
        538, in ascending id order.  Ids without records keep a total of 0.
    """
    totals = [[student_id, 0] for student_id in np.arange(1, 539, 1)]
    for record in consumption:
        # Ids are 1-based, list positions are 0-based.
        slot = totals[int(record[0]) - 1]
        slot[1] += record[4]
    return np.array(totals)
def catalog2list(file_name='./data/catalog.npy'):
    """Load the book catalog and return its first two fields per row as text.

    Args:
        file_name: Path of the ``.npy`` catalog.  Defaults to the location
            that used to be hard-coded.

    Returns:
        On Python 3, a list of ``[field 0, field 1]`` pairs of ``str``;
        byte-string entries are decoded, entries that are already text are
        passed through instead of raising on ``.decode()``.
        On Python 2, the loaded array is returned unchanged.
    """
    book_catalog = np.load(file_name)
    # Compare the numeric version tuple: the old string test
    # ``sys.version > '3'`` is lexicographic and is wrong for e.g. "10.x".
    if sys.version_info[0] < 3:
        return book_catalog

    def as_text(value):
        return value.decode() if isinstance(value, bytes) else str(value)

    return [[as_text(row[0]), as_text(row[1])] for row in book_catalog]
def counter_book(book_list):
    """Tally how many entries of ``book_list`` share each value of field 1.

    Returns:
        A dict mapping each distinct ``entry[1]`` to its number of
        occurrences, in order of first appearance.
    """
    tally = {}
    for entry in book_list:
        category = entry[1]
        tally[category] = tally.get(category, 0) + 1
    return tally
# Load every raw table and split it into the three terms.
score_1st, score_2nd, score_3rd = split_period('./data/score.txt')
access_1st, access_2nd, access_3rd = split_period('./data/access.txt')
consumption_1st, consumption_2nd, consumption_3rd = split_term_consumption_npy('./data/consumption.npy')
borrow_1st, borrow_2nd, borrow_3rd = split_period('./data/borrow.txt')
# Book catalog: list of pairs, plus a dict built from those pairs.
lib_list = catalog2list()
lib_dict = dict(lib_list)
# Exploration code kept for reference:
# print(lib_list)
# print("book status")
# # number of books per category
# book_dict = counter_book(lib_list)
# book_label = book_dict.keys()
# print(book_dict)
# print("book_label")
# # category labels
# print(book_label)
print("")
# Term 3: how often each student borrowed a book.
borrow_3rd_time = counter(np.sort(borrow_3rd[:, 0]))
# Term 3: how often each student entered the library.
access_3rd_times = counter(np.sort(access_3rd[:, 0]))
# Term 3: score rows ordered by their first column.
score_3rd_times = score_3rd[score_3rd[:, 0].argsort()]
# Term 3: per-student healthy-eating counts.
health_eat_consumption_3rd_times = counter_health_eating_consumption(consumption_3rd)
# Term 3: per-student total spending.
consumption_3rd_times = counter__consumption(consumption_3rd)
# Term 1: score rows ordered by their first column.
score_1st_times = score_1st[score_1st[:, 0].argsort()]
# Term 2: score rows ordered by their first column.  The ordering must come
# from score_2nd itself; using score_1st's argsort here (as before) pairs
# rows with the wrong students whenever the two tables are ordered differently.
score_2nd_times = score_2nd[score_2nd[:, 0].argsort()]
def load_student_data():
    """Assemble the per-student feature matrix and standardize it.

    For each of the 538 row positions, one feature row is built from
    column 1 of the module-level tables, in this order: term-3 borrow
    count, term-3 library access count, term-3 total spending, term-3
    healthy-eating count, term-1 score, term-2 score.

    Returns:
        The feature matrix after ``preprocessing.scale``.
    """
    feature_rows = [
        [
            borrow_3rd_time[row][1],
            access_3rd_times[row][1],
            consumption_3rd_times[row][1],
            health_eat_consumption_3rd_times[row][1],
            score_1st_times[row][1],
            score_2nd_times[row][1],
        ]
        for row in np.arange(538)
    ]
    features = np.array(feature_rows)
    return preprocessing.scale(features)
def load_label():
    """Return column 1 of the sorted term-3 score table as the labels."""
    labels = score_3rd_times[:, 1]
    return labels