# Machine Learning in Action — regression code (Python 3.6)

# Notes on a few small numpy helpers used below:
#
# 1. mat()     converts target data to a matrix (numpy.matrix)
#
# 2. zeros()   creates an all-zero array of the given shape
#
# 3. ones()    creates an all-one array
#
# 4. eye()     creates an identity matrix with the given number of rows
#
# 5. .T        on a matrix, returns its transpose
#
# 6. tolist()  converts a matrix into a Python list
#
# 7. getA()    numpy.matrix method that returns the matrix as an ndarray,
#              equivalent to np.asarray(self)
#
# 8. .I        returns the inverse of a matrix. Inverses are needed
#              frequently here: for a matrix A, its inverse B satisfies
#              AB = I (the identity matrix).


from numpy import *
import matplotlib.pyplot as plt

def load_data_set(file_name):
	"""Load a tab-separated data file.

	Each line holds the feature columns followed by one target column
	(the last column is assumed to be the target value).

	Returns:
		(data_mat, label_mat): a list of feature rows (floats) and the
		list of target floats.
	"""
	data_mat = []
	label_mat = []
	# Original code opened the file twice and never closed either
	# handle; a single `with` block fixes the resource leak.
	with open(file_name) as fr:
		first_line = fr.readline()
		num_feat = len(first_line.split('\t')) - 1
		fr.seek(0)  # rewind so the first line is processed too
		for line in fr:
			cur_line = line.strip().split('\t')
			line_arr = [float(cur_line[i]) for i in range(num_feat)]
			data_mat.append(line_arr)
			label_mat.append(float(cur_line[-1]))
	return data_mat, label_mat


def stand_regress(x_arr, y_arr):
	"""Ordinary least squares via the normal equation.

	Returns the weight column vector ws, or None when the normal
	matrix X^T X is singular (no inverse exists).
	"""
	x_mat = mat(x_arr)
	y_mat = mat(y_arr)
	normal = x_mat.T * x_mat
	# A zero determinant means the normal matrix cannot be inverted.
	if linalg.det(normal) == 0.0:
		print('this matrix is singular, cannot do inverse')
		return
	# y_arr arrives as a flat list, so mat() makes a row vector; the
	# extra .T turns it into the column vector the formula expects
	# (the book's listing omits this transpose).
	return normal.I * (x_mat.T * y_mat.T)

def lwlr(test_point, x_arr, y_arr, k=1.0):
	"""Locally weighted linear regression prediction for one point.

	Builds a Gaussian-kernel weight for every training sample
	(bandwidth k), solves the weighted normal equation, and returns
	the prediction test_point * ws.  Returns None when the weighted
	normal matrix is singular.
	"""
	x_mat = mat(x_arr)
	y_mat = mat(y_arr)
	n_samples = shape(x_mat)[0]
	weights = mat(eye(n_samples))
	# Weights decay exponentially with squared distance from the
	# query point; smaller k concentrates weight on nearby samples.
	for idx in range(n_samples):
		diff = test_point - x_mat[idx, :]
		weights[idx, idx] = exp(diff * diff.T / (-2.0 * k ** 2))
	normal = x_mat.T * (weights * x_mat)
	if linalg.det(normal) == 0.0:
		print('this matrix is singular, cannot do inverse')
		return
	ws = normal.I * (x_mat.T * (weights * y_mat.T))
	return test_point * ws

def lwlr_test(test_arr, x_arr, y_arr, k=1.0):
	"""Run lwlr() on every row of test_arr with bandwidth k.

	Returns a 1-D array of predictions, one per test row.
	"""
	predictions = zeros(shape(test_arr)[0])
	for idx in range(len(predictions)):
		predictions[idx] = lwlr(test_arr[idx], x_arr, y_arr, k)
	return predictions

# if __name__ == '__main__':
#     x_arr, y_arr = load_data_set('ex0.txt')
#     # print(x_arr[0:2])
#     # ws = stand_regress(x_arr, y_arr)
#     # print(ws)
#     # x_mat = mat(x_arr)
#     # y_mat = mat(y_arr)
#     # y_hat = x_mat * ws
#     # fig = plt.figure()
#     # ax = fig.add_subplot(111)
#     # ax.scatter(x_mat[:, 1].flatten().A[0], y_mat.T[:, 0].flatten().A[0])
#     # plt.show()
#     #
#     # x_copy = x_mat.copy()
#     # x_copy.sort(0)
#     # y_hat = x_copy * ws
#     # ax.plot(x_copy[:, 1], y_hat)
#     k = float(0.0015)
#     while k < 0.05:
# 	    y_hat = lwlr_test(x_arr, x_arr, y_arr, k)
# 	    x_mat = mat(x_arr)
# 	    y_mat = mat(y_arr)
# 	    srt_ind = x_mat[:, 1].argsort(0)
# 	    x_sort = x_mat[srt_ind][:, 0, :]
# 	    fig = plt.figure()
# 	    ax = fig.add_subplot(111)
# 	    ax.plot(x_sort[:,1], y_hat[srt_ind])
# 	    ax.scatter(x_mat[:, 1].flatten().A[0], y_mat.T.flatten().A[0], s=2, c='red')
# 	    plt.title('K = '+str(k)[0:6])
# 	    plt.savefig('E:\Li_Python\Regression\k_png\k_is_'+str(k)[0:6].replace('.', '_')+'.png')
# 	    plt.show()
# 	    k += 0.0005

def rss_error(y_arr, y_hat_arr):
	"""Residual sum of squares between targets and predictions."""
	residuals = y_arr - y_hat_arr
	return (residuals ** 2).sum()


# if __name__ == '__main__':
#     abx, aby = load_data_set('abalone.txt')
#     y_hat01 = lwlr_test(abx[0:99], abx[0:99], aby[0:99], 0.1)
#     y_hat1 = lwlr_test(abx[0:99], abx[0:99], aby[0:99], 1)
#     y_hat10 = lwlr_test(abx[0:99], abx[0:99], aby[0:99], 10)
#     print('k=0.1 : ', rss_error(aby[0:99], y_hat01.T))
#     print('k=1 : ', rss_error(aby[0:99], y_hat1.T))
#     print('k=10 : ', rss_error(aby[0:99], y_hat10.T))
#
#     y_hat01 = lwlr_test(abx[100:199], abx[0:99], aby[0:99], 0.1)
#     y_hat1 = lwlr_test(abx[100:199], abx[0:99], aby[0:99], 1)
#     y_hat10 = lwlr_test(abx[100:199], abx[0:99], aby[0:99], 10)
#     print('k=0.1 : ', rss_error(aby[100:199], y_hat01.T))
#     print('k=1 : ', rss_error(aby[100:199], y_hat1.T))
#     print('k=10 : ', rss_error(aby[100:199], y_hat10.T))

def ridge_regress(x_mat, y_mat, lam=0.2):
	"""Ridge regression: solve (X^T X + lam * I) w = X^T y.

	Returns the weight column vector, or None when the regularised
	normal matrix is still singular (e.g. lam == 0 on degenerate data).
	"""
	penalised = x_mat.T * x_mat + eye(shape(x_mat)[1]) * lam
	if linalg.det(penalised) == 0.0:
		print('this matrix is singular, cannot do inverse')
		return
	return penalised.I * (x_mat.T * y_mat)

def ridge_test(x_arr, y_arr):
	"""Compute ridge weights over 30 lambda values exp(-10)..exp(19).

	Targets are centred and features standardised first (ridge
	penalties are scale-sensitive).  Returns a (30, n_features) array
	with one weight row per lambda.
	"""
	x_mat = mat(x_arr)
	y_mat = mat(y_arr).T
	y_mat = y_mat - mean(y_mat, 0)  # centre the targets
	# Standardise each feature column: (x - mean) / variance.
	x_mat = (x_mat - mean(x_mat, 0)) / var(x_mat, 0)
	num_test_pts = 30
	w_mat = zeros((num_test_pts, shape(x_mat)[1]))
	for i in range(num_test_pts):
		# Lambda sweeps exponentially so the plot spans many scales.
		w_mat[i, :] = ridge_regress(x_mat, y_mat, exp(i - 10)).T
	return w_mat
	
	
# if __name__ == '__main__':
#     abx, aby = load_data_set('abalone.txt')
#     ridge_weights = ridge_test(abx, aby)
#     fig = plt.figure()
#     ax = fig.add_subplot(111)
#     ax.plot(ridge_weights)
#     plt.savefig('log_lambda.png')
#     plt.show()

#

def regularize(xMat):
    """Standardise each column: subtract its mean, divide by its variance.

    (Division is by the variance, not the standard deviation,
    mirroring the book's convention.)
    """
    scaled = xMat.copy()
    col_means = mean(scaled, 0)
    col_vars = var(scaled, 0)
    return (scaled - col_means) / col_vars


def stage_wise(x_arr, y_arr, eps=0.1, num_it=100):
	"""Forward stagewise regression (a greedy approximation of the lasso).

	Each of the num_it iterations tries nudging every coefficient by
	+/- eps and keeps the single change that most lowers the residual
	sum of squares.  Prints the weight vector once per iteration and
	returns a (num_it, n_features) history of the weights.
	"""
	x_mat = regularize(mat(x_arr))  # zero-mean, variance-scaled features
	y_mat = mat(y_arr).T            # .T makes the targets a column vector
	y_mat = y_mat - mean(y_mat, 0)
	num_features = shape(x_mat)[1]
	history = zeros((num_it, num_features))
	ws = zeros((num_features, 1))
	best_ws = ws.copy()
	for it in range(num_it):
		print(ws.T)
		best_error = inf
		# Greedy step: test +eps and -eps on every coefficient.
		for feat in range(num_features):
			for direction in (-1, 1):
				candidate = ws.copy()
				candidate[feat] += eps * direction
				error = rss_error(y_mat.A, (x_mat * candidate).A)
				if error < best_error:
					best_error = error
					best_ws = candidate
		ws = best_ws.copy()
		history[it, :] = ws.T
	return history

if __name__ == '__main__':
	# Demo: run forward stagewise regression on the abalone data set
	# and plot how the coefficients evolve over iterations.
	x_arr, y_arr = load_data_set('abalone.txt')
	print(stage_wise(x_arr, y_arr, 0.01, 200))
	print('-' * 50)
	# Compute the 5000-iteration run once and reuse it for the plot
	# (the original called stage_wise twice, doubling the runtime).
	weights_history = stage_wise(x_arr, y_arr, 0.001, 5000)
	print(weights_history)
	fig = plt.figure()
	ax = fig.add_subplot(111)
	ax.plot(weights_history)
	plt.show()

### Forward stagewise regression (a greedy algorithm)
#
# Pseudocode:
#
#   Standardise the data to zero mean and unit variance.
#   For each iteration:
#       Set the current lowest error, lowest_error, to +inf.
#       For each feature:
#           For both increasing and decreasing:
#               Change one coefficient to get a new w.
#               Compute the error under the new w.
#               If the error is lower than lowest_error:
#                   Set w_best to the current w.
#       Set w to the new w_best.
#
# ndarray.mean([axis, dtype, out, keepdims])
# Returns the average of the array elements along the given axis.
#
# The main advantage of stagewise linear regression is that it helps
# you understand and improve an existing model: once a model is built,
# run the algorithm to find the important features, making it possible
# to stop collecting the unimportant ones early.