# numpy庫下的幾個小函數的用法
# 1、mat函數
#
# mat函數可以將目標數據的類型轉換爲矩陣(matrix)
#
# 2、zeros
#
# zeros函數是生成指定維數的全0數組
#
# 3、ones
#
# ones函數是用於生成一個全1的數組
#
# 4.eye
#
# eye函數用於生成指定行數的單位矩陣
#
# 5、.T
#
# .T作用於矩陣,用作求矩陣的轉置
#
# 6、tolist
#
# tolist函數用於把一個矩陣轉化成爲list列表
#
# 7.getA()
#
# getA()函數是numpy.matrix下的一個函數,用作把矩陣轉換成數組,等價於np.asarray(self).
#
# 8. .I
#
# .I用作求矩陣的逆矩陣。逆矩陣在計算中是經常需要用到的。例如一個矩陣A,求A的逆矩陣B,即存在矩陣B是的AB=I(I爲單位)
from numpy import *
import matplotlib.pyplot as plt
def load_data_set(file_name):
    """Parse a tab-delimited data file into features and target values.

    Each line holds N float columns: the first N-1 columns are features
    and the last column is the target value.

    Args:
        file_name: path to a tab-separated text file.

    Returns:
        (data_mat, label_mat): a list of feature rows (each a list of
        floats) and the list of target floats, one entry per line.
    """
    # Open the file once and read everything (the original opened the
    # file twice and never closed either handle); `with` guarantees the
    # handle is released even on error.
    with open(file_name) as fr:
        lines = fr.readlines()
    # Number of feature columns = total columns minus the target column.
    num_feat = len(lines[0].split('\t')) - 1
    data_mat = []
    label_mat = []
    for line in lines:
        cur_line = line.strip().split('\t')
        data_mat.append([float(cur_line[i]) for i in range(num_feat)])
        label_mat.append(float(cur_line[-1]))  # last column is the target
    return data_mat, label_mat
def stand_regress(x_arr, y_arr):
    """Ordinary least-squares regression: ws = (X^T X)^-1 X^T y.

    Args:
        x_arr: feature rows (list of lists of floats).
        y_arr: flat list of target values.

    Returns:
        The weight vector as an (n x 1) matrix, or None when X^T X is
        singular and therefore cannot be inverted.
    """
    x_mat = mat(x_arr)
    # y_arr is a flat list, so mat() yields a 1 x m row; transpose it
    # into the m x 1 column the normal equation expects.
    y_col = mat(y_arr).T
    xTx = x_mat.T * x_mat
    # A zero determinant means xTx has no inverse.
    if linalg.det(xTx) == 0.0:
        print('this matrix is singular, cannot do inverse')
        return
    return xTx.I * (x_mat.T * y_col)
def lwlr(test_point, x_arr, y_arr, k=1.0):
    """Locally weighted linear regression prediction for one point.

    Every training sample receives a Gaussian-kernel weight that decays
    with its distance from test_point; k controls how fast the weights
    fall off (smaller k = more local fit).

    Returns:
        The predicted value as a 1x1 matrix, or None when the weighted
        normal matrix is singular.
    """
    x_mat = mat(x_arr)
    y_mat = mat(y_arr)
    n_samples = shape(x_mat)[0]
    # Start from an identity matrix: one diagonal weight per sample.
    weights = mat(eye(n_samples))
    for idx in range(n_samples):
        delta = test_point - x_mat[idx, :]
        # Gaussian kernel: weight decays exponentially with squared distance.
        weights[idx, idx] = exp(delta * delta.T / (-2.0 * k ** 2))
    xTx = x_mat.T * (weights * x_mat)
    if linalg.det(xTx) == 0.0:
        print('this matrix is singular, cannot do inverse')
        return
    ws = xTx.I * (x_mat.T * (weights * y_mat.T))
    return test_point * ws
def lwlr_test(test_arr, x_arr, y_arr, k=1.0):
    """Run lwlr() on every row of test_arr.

    Returns a 1-D array holding one prediction per test row, all made
    against the same training data (x_arr, y_arr) and kernel width k.
    """
    n_points = shape(test_arr)[0]
    predictions = zeros(n_points)
    for idx in range(n_points):
        predictions[idx] = lwlr(test_arr[idx], x_arr, y_arr, k)
    return predictions
# if __name__ == '__main__':
# x_arr, y_arr = load_data_set('ex0.txt')
# # print(x_arr[0:2])
# # ws = stand_regress(x_arr, y_arr)
# # print(ws)
# # x_mat = mat(x_arr)
# # y_mat = mat(y_arr)
# # y_hat = x_mat * ws
# # fig = plt.figure()
# # ax = fig.add_subplot(111)
# # ax.scatter(x_mat[:, 1].flatten().A[0], y_mat.T[:, 0].flatten().A[0])
# # plt.show()
# #
# # x_copy = x_mat.copy()
# # x_copy.sort(0)
# # y_hat = x_copy * ws
# # ax.plot(x_copy[:, 1], y_hat)
# k = float(0.0015)
# while k < 0.05:
# y_hat = lwlr_test(x_arr, x_arr, y_arr, k)
# x_mat = mat(x_arr)
# y_mat = mat(y_arr)
# srt_ind = x_mat[:, 1].argsort(0)
# x_sort = x_mat[srt_ind][:, 0, :]
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.plot(x_sort[:,1], y_hat[srt_ind])
# ax.scatter(x_mat[:, 1].flatten().A[0], y_mat.T.flatten().A[0], s=2, c='red')
# plt.title('K = '+str(k)[0:6])
# plt.savefig('E:\Li_Python\Regression\k_png\k_is_'+str(k)[0:6].replace('.', '_')+'.png')
# plt.show()
# k += 0.0005
def rss_error(y_arr, y_hat_arr):
    """Residual sum of squares between actual and predicted arrays."""
    residuals = y_arr - y_hat_arr
    return (residuals ** 2).sum()
# if __name__ == '__main__':
# abx, aby = load_data_set('abalone.txt')
# y_hat01 = lwlr_test(abx[0:99], abx[0:99], aby[0:99], 0.1)
# y_hat1 = lwlr_test(abx[0:99], abx[0:99], aby[0:99], 1)
# y_hat10 = lwlr_test(abx[0:99], abx[0:99], aby[0:99], 10)
# print('k=0.1 : ', rss_error(aby[0:99], y_hat01.T))
# print('k=1 : ', rss_error(aby[0:99], y_hat1.T))
# print('k=10 : ', rss_error(aby[0:99], y_hat10.T))
#
# y_hat01 = lwlr_test(abx[100:199], abx[0:99], aby[0:99], 0.1)
# y_hat1 = lwlr_test(abx[100:199], abx[0:99], aby[0:99], 1)
# y_hat10 = lwlr_test(abx[100:199], abx[0:99], aby[0:99], 10)
# print('k=0.1 : ', rss_error(aby[100:199], y_hat01.T))
# print('k=1 : ', rss_error(aby[100:199], y_hat1.T))
# print('k=10 : ', rss_error(aby[100:199], y_hat10.T))
def ridge_regress(x_mat, y_mat, lam=0.2):
    """Ridge regression: ws = (X^T X + lam*I)^-1 X^T y.

    Args:
        x_mat: (m x n) feature matrix.
        y_mat: (m x 1) target column matrix.
        lam:   regularization strength.

    Returns:
        The (n x 1) weight matrix, or None when the regularized matrix
        is still singular (possible when lam is 0).
    """
    xTx = x_mat.T * x_mat
    # Adding lam along the diagonal shrinks the weights and normally
    # makes the matrix invertible even when xTx alone is not.
    denom = xTx + eye(shape(x_mat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print('this matrix is singular, cannot do inverse')
        return
    return denom.I * (x_mat.T * y_mat)
def ridge_test(x_arr, y_arr):
    """Sweep ridge regression over 30 lambda values exp(-10)..exp(19).

    The target is mean-centred and every feature column is standardized
    before fitting.  Returns a (30 x n) array whose i-th row is the
    weight vector fitted with lambda = exp(i - 10).
    """
    x_mat = mat(x_arr)
    y_mat = mat(y_arr).T
    # Centre the target around zero.
    y_mat = y_mat - mean(y_mat, 0)
    # Standardize each feature column.  NOTE(review): divides by the
    # variance rather than the standard deviation — this follows the
    # book's convention, kept as-is.
    x_mat = (x_mat - mean(x_mat, 0)) / var(x_mat, 0)
    num_test_pts = 30
    w_mat = zeros((num_test_pts, shape(x_mat)[1]))
    for i in range(num_test_pts):
        # Lambdas spaced exponentially from e^-10 up to e^19.
        ws = ridge_regress(x_mat, y_mat, exp(i - 10))
        w_mat[i, :] = ws.T
    return w_mat
# if __name__ == '__main__':
# abx, aby = load_data_set('abalone.txt')
# ridge_weights = ridge_test(abx, aby)
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.plot(ridge_weights)
# plt.savefig('log_lambda.png')
# plt.show()
#
def regularize(xMat):
    """Standardize each column: subtract its mean, divide by its variance.

    Returns a new matrix; the input is left untouched.
    NOTE(review): divides by the variance, not the standard deviation,
    matching ridge_test's convention.
    """
    out = xMat.copy()
    col_means = mean(out, 0)
    col_vars = var(out, 0)
    return (out - col_means) / col_vars
def stage_wise(x_arr, y_arr, eps=0.1, num_it=100):
    """Forward stagewise regression (a greedy approximation to the lasso).

    On every iteration, tries nudging each weight by +/- eps and keeps
    the single change that yields the lowest residual sum of squares.
    Prints the current weight vector at the start of each iteration.

    Args:
        x_arr:  feature rows; standardized internally via regularize().
        y_arr:  flat list of targets; mean-centred internally.
        eps:    step size applied to one weight per iteration.
        num_it: number of greedy iterations.

    Returns:
        A (num_it x n) array recording the weight vector after every
        iteration.
    """
    x_mat = mat(x_arr)
    y_mat = mat(y_arr).T          # targets as an m x 1 column
    y_mat = y_mat - mean(y_mat, 0)  # centre the target
    x_mat = regularize(x_mat)       # standardize the features
    m, n = shape(x_mat)
    return_mat = zeros((num_it, n))
    ws = zeros((n, 1))
    ws_max = ws.copy()
    for it in range(num_it):
        print(ws.T)
        lowest_error = inf
        # Greedy search: every feature, both directions.
        for feat in range(n):
            for sign in (-1, 1):
                ws_test = ws.copy()
                ws_test[feat] += eps * sign
                y_test = x_mat * ws_test
                err = rss_error(y_mat.A, y_test.A)
                if err < lowest_error:
                    lowest_error = err
                    ws_max = ws_test
        ws = ws_max.copy()
        return_mat[it, :] = ws.T
    return return_mat
if __name__ == '__main__':
    # Demo: forward stagewise regression on the abalone data set.
    x_arr, y_arr = load_data_set('abalone.txt')
    print(stage_wise(x_arr, y_arr, 0.01, 200))
    print('-' * 50)
    # Run the fine-grained 5000-iteration sweep ONCE and reuse the
    # result for both the print and the plot (the original called
    # stage_wise(x_arr, y_arr, 0.001, 5000) twice, doubling the work).
    weights_history = stage_wise(x_arr, y_arr, 0.001, 5000)
    print(weights_history)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # One line per weight, showing how each coefficient evolves.
    ax.plot(weights_history)
    plt.show()
###前向逐步迴歸(屬於一種貪心算法)
# 僞代碼:
# 數據標準化,使其分佈滿足0均值和單位方差
# 在每一輪的迭代過程中:
#     設置當前最小誤差lowest_error爲正無窮(inf)
#     對每一個特徵:
#         增大或者減小:
#             改變一個係數得到一個新的w
#             計算新的w下的誤差
#             如果誤差error小於當前最小誤差lowest_error:
#                 設置w_best等於當前的w
#     將w設置爲新的w_best
# ndarray.mean([axis, dtype, out, keepdims])
#     Returns the average of the array elements along given axis
# 逐步線性迴歸算法主要優點在於其可以幫助理解現有的模型並作出改進,
# 當構建一個模型之後可以運行該算法來找出重要的特徵,這樣就有可能及時停止
# 對不重要特徵的收集