斯坦福大學機器學習作業題Problem Set #1 Regression for denoising quasar spectra 下篇

原創

2020-02-21 14:47

（i）處理文件：讀取 quasar_test.csv，對每條光譜做局部加權線性迴歸平滑（帶寬 5），並把平滑結果寫入 result1.csv

# -*- coding: utf-8 -*-
import numpy as np
import math
import matplotlib.pyplot as plt
import csv
def read():
    """Smooth every spectrum in ``quasar_test.csv`` and save the result.

    The first non-blank line of the input supplies the regression inputs
    (plotted elsewhere in this file as wavelengths); every following line is
    one spectrum sampled at those inputs.  Each spectrum is smoothed with
    ``weighted_linear_regression`` using bandwidth 5.

    The smoothed spectra are written to ``result1.csv`` as plain CSV: a
    header row with the wavelengths followed by one comma-separated row per
    spectrum.  This is the layout the ``result*.csv`` readers in this file
    expect (first line = wavelengths, remaining lines = spectra); writing
    ``str(list)`` instead would produce bracketed text they cannot parse.

    Returns:
        numpy.ndarray: smoothed spectra, one row per input spectrum.
    """
    with open('quasar_test.csv', 'r') as source:
        # Blank lines (e.g. a trailing newline) carry no data.
        lines = [line.strip() for line in source if line.strip()]

    wavelengths = [float(value) for value in lines[0].split(',')]
    n = len(wavelengths)

    # Design matrix: row 0 holds the inputs, row 1 the intercept column.
    x = np.ones((2, n))
    x[0] = wavelengths

    y2 = np.zeros((len(lines) - 1, n))
    with open('result1.csv', 'w') as out:
        out.write(','.join(repr(value) for value in wavelengths) + '\n')
        for i, line in enumerate(lines[1:]):
            # A real list (not a lazy map object) so numpy can consume it.
            y = [float(value) for value in line.split(',')]
            y2[i] = weighted_linear_regression(x, y, 5)
            q = y2[i].tolist()
            print(q)
            out.write(','.join(repr(value) for value in q) + '\n')
    return y2

def weighted_linear_regression(x,y,t):
    """Locally weighted linear regression evaluated at every input point.

    For each query point ``x[0][i]`` a straight line is fitted to all the
    data with Gaussian weights ``exp(-(x[0][i] - x[0][j])**2 / (2 * t**2))``
    and the line is evaluated at the query point.

    Args:
        x: 2 x n array-like; row 0 holds the inputs, row 1 the intercept
            column (ones).
        y: Sequence of n target values.
        t: Bandwidth of the Gaussian weighting.

    Returns:
        list: the n fitted values, in the order of ``x[0]``.

    Raises:
        numpy.linalg.LinAlgError: if a weighted normal matrix is singular.
    """
    design = np.asarray(x, dtype=float)
    targets = np.asarray(y, dtype=float)
    inputs = design[0]
    two_t_squared = 2.0 * t * t

    y2 = []
    for query in inputs:
        # The weight matrix is diagonal, so scaling the columns of the
        # design matrix by a weight vector equals X.W without ever building
        # the dense n x n matrix.
        weights = np.exp(-(query - inputs) ** 2 / two_t_squared)
        weighted_design = design * weights
        # Solve (X W X^T) theta = X W y directly rather than forming the
        # inverse explicitly.
        theta = np.linalg.solve(np.dot(weighted_design, design.T),
                                np.dot(weighted_design, targets))
        y2.append(query * theta[0] + theta[1])
    return y2
# Script entry point: run the smoothing immediately and keep the array
# returned by read() in y2.
y2=read()

（ii）以 result.csv 中的平滑光譜為資料，用距離最近的三條光譜估計每條光譜的左側部分，並計算平均誤差

# -*- coding: utf-8 -*-
import numpy as np
import heapq
import csv
def read():
    """Load the contents of ``result.csv``.

    The first line is parsed as a list of floats and every following
    non-blank line as one row of floats of the same length.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the list of floats from the first
        line and ``y`` is a 2-D ``numpy.ndarray`` with one row per remaining
        line.

    Raises:
        IndexError: if the file contains no data lines at all.
        ValueError: if a field is not numeric or a row has the wrong length.
    """
    with open('result.csv') as source:
        # Blank lines (e.g. a trailing newline) carry no data.
        lines = [line.strip() for line in source if line.strip()]

    # Real lists rather than map objects, so len() works on Python 3 too.
    x = [float(value) for value in lines[0].split(',')]
    rows = [[float(value) for value in line.split(',')] for line in lines[1:]]
    # The reshape pins the column count to len(x), even when there are no rows.
    y = np.array(rows, dtype=float).reshape(len(rows), len(x))
    return x, y
# Load the data once at module level; error() below reads the global x.
x,y=read()
def predict(x,y,m):
    """Estimate the first 50 columns of spectrum ``m`` from its neighbours.

    Spectra are compared on columns 150-449 with the squared Euclidean
    distance.  The three spectra closest to spectrum ``m`` (spectrum ``m``
    itself excluded) are averaged over columns 0-49 with weights
    ``max(1 - d / h, 0)``.

    ``h`` is the largest distance from spectrum ``m`` to any spectrum in
    ``y``, as the assignment defines it.  Taking the maximum over the three
    neighbours only would always give the farthest neighbour weight zero.

    Args:
        x: Unused; kept so existing callers keep working.
        y: 2-D array-like with one spectrum per row.
        m: Row index of the spectrum to estimate.

    Returns:
        list: the 50 estimated values for columns 0-49.

    Raises:
        ValueError: if ``y`` contains no spectrum other than ``m``.
    """
    spectra = np.asarray(y, dtype=float)
    right = spectra[:, 150:450]
    dis = ((right - right[m]) ** 2).sum(axis=1)

    # Select neighbours by row index (not by distance value) so that tied
    # distances still map to distinct spectra.
    others = [i for i in range(len(dis)) if i != m]
    location = heapq.nsmallest(3, others, key=lambda i: dis[i])
    if not location:
        raise ValueError('predict needs at least one spectrum besides m')

    h = dis.max()
    if h > 0:
        weights = np.array([max(1.0 - dis[i] / h, 0.0) for i in location])
    else:
        weights = np.zeros(len(location))
    if weights.sum() == 0:
        # Every neighbour sits at the maximum distance (or all spectra are
        # identical): fall back to a plain average instead of dividing by 0.
        weights = np.ones(len(location))

    f_left = np.dot(weights, spectra[location, 0:50]) / weights.sum()
    return f_left.tolist()

def error(y):
    """Print and return the mean squared estimation error over all spectra.

    For every row ``m`` of ``y`` the values returned by ``predict`` are
    compared with the leading columns of ``y[m]``; the squared differences
    are summed per spectrum and the per-spectrum sums are averaged.

    Uses the module-level ``x`` when calling ``predict``.

    Args:
        y: 2-D array-like with one spectrum per row.

    Returns:
        The average of the per-spectrum squared errors (also printed).
    """
    sum_error = []
    for m in range(len(y)):
        f_left = predict(x, y, m)
        sum_error.append(
            sum((estimate - y[m][n]) ** 2 for n, estimate in enumerate(f_left)))
    average = sum(sum_error) / len(sum_error)
    # Function-call form prints identically on Python 2 and Python 3.
    print(average)
    return average
# Run the evaluation on the spectra loaded above; error() prints the result.
error(y)

（iii）以 result.csv 為參考資料，對 result1.csv 中的光譜估計左側部分、計算平均誤差，並畫出第 1 條與第 6 條光譜的預測值與實際值

# -*- coding: utf-8 -*-
import numpy as np
import heapq
import matplotlib.pyplot as plt
def read1():
    """Load the data stored in ``result.csv``.

    The first line is parsed as a list of floats and every following
    non-blank line as one row of floats of the same length.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the list of floats from the first
        line and ``y`` is a 2-D ``numpy.ndarray`` with one row per remaining
        line.

    Raises:
        IndexError: if the file contains no data lines at all.
        ValueError: if a field is not numeric or a row has the wrong length.
    """
    with open('result.csv') as source:
        # Blank lines (e.g. a trailing newline) carry no data.
        lines = [line.strip() for line in source if line.strip()]

    # Real lists rather than map objects, so len() works on Python 3 too.
    x = [float(value) for value in lines[0].split(',')]
    rows = [[float(value) for value in line.split(',')] for line in lines[1:]]
    # The reshape pins the column count to len(x), even when there are no rows.
    y = np.array(rows, dtype=float).reshape(len(rows), len(x))
    return x, y
def read2():
    """Load the data to be predicted from ``result1.csv``.

    The first line is parsed as a list of floats and every following
    non-blank line as one row of floats of the same length.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the list of floats from the first
        line and ``y`` is a 2-D ``numpy.ndarray`` with one row per remaining
        line.

    Raises:
        IndexError: if the file contains no data lines at all.
        ValueError: if a field is not numeric or a row has the wrong length.
    """
    with open('result1.csv') as source:
        # Blank lines (e.g. a trailing newline) carry no data.
        lines = [line.strip() for line in source if line.strip()]

    # Real lists rather than map objects, so len() works on Python 3 too.
    x = [float(value) for value in lines[0].split(',')]
    rows = [[float(value) for value in line.split(',')] for line in lines[1:]]
    # The reshape pins the column count to len(x), even when there are no rows.
    y = np.array(rows, dtype=float).reshape(len(rows), len(x))
    return x, y

def predict(x,y,y1,m):
    """Estimate the first 50 columns of ``y1[m]`` from its neighbours in ``y``.

    Spectra are compared on columns 150-449 with the squared Euclidean
    distance.  The three rows of ``y`` closest to ``y1[m]`` are averaged over
    columns 0-49 with weights ``max(1 - d / h, 0)``.

    ``h`` is the largest distance from ``y1[m]`` to any row of ``y``, as the
    assignment defines it.  Taking the maximum over the three neighbours only
    would always give the farthest neighbour weight zero.

    Args:
        x: Unused; kept so existing callers keep working.
        y: 2-D array-like of reference spectra, one per row.
        y1: 2-D array-like holding the spectra to be predicted.
        m: Row index into ``y1`` of the spectrum to estimate.

    Returns:
        list: the 50 estimated values for columns 0-49.

    Raises:
        ValueError: if ``y`` contains no spectra.
    """
    reference = np.asarray(y, dtype=float)
    if len(reference) == 0:
        raise ValueError('predict needs at least one reference spectrum')
    query = np.asarray(y1[m], dtype=float)[150:450]
    dis = ((reference[:, 150:450] - query) ** 2).sum(axis=1)

    # Select neighbours by row index (not by distance value) so that tied
    # distances still map to distinct spectra.
    location = heapq.nsmallest(3, range(len(dis)), key=lambda i: dis[i])

    h = dis.max()
    if h > 0:
        weights = np.array([max(1.0 - dis[i] / h, 0.0) for i in location])
    else:
        weights = np.zeros(len(location))
    if weights.sum() == 0:
        # Every neighbour sits at the maximum distance (or matches exactly
        # with h == 0): fall back to a plain average instead of dividing by 0.
        weights = np.ones(len(location))

    f_left = np.dot(weights, reference[location, 0:50]) / weights.sum()
    return f_left.tolist()

def error(x,y,y1):
    """Predict every spectrum in ``y1`` and summarise the squared errors.

    Args:
        x: Passed through to ``predict``.
        y: Reference spectra handed to ``predict``.
        y1: Spectra to predict; each prediction is compared with the leading
            columns of the corresponding row.

    Returns:
        tuple: the prediction for row 0, the prediction for row 5, and the
        mean over all rows of the summed squared differences.
    """
    estimates = [predict(x, y, y1, row) for row in range(len(y1))]

    squared = []
    for row, estimate in enumerate(estimates):
        total = 0
        for col, value in enumerate(estimate):
            residual = value - y1[row][col]
            total = total + residual * residual
        squared.append(total)

    return estimates[0], estimates[5], sum(squared) / len(squared)

def figure(x,example_1,example_6,y1):
    """Draw two scatter plots comparing predicted and actual values.

    Figure 1 shows ``example_1`` against row 0 of ``y1`` and figure 2 shows
    ``example_6`` against row 5 of ``y1``, both over the first 50 entries of
    ``x``.  Predictions are blue, actual values green.  Both figures are
    displayed with a single ``plt.show()`` call.
    """
    panels = ((1, example_1, 0), (2, example_6, 5))
    for number, predicted, row in panels:
        plt.figure(number)
        plt.xlabel('Wavelength')
        plt.ylabel('Flux')
        plt.scatter(x[0:50], predicted, marker='.', color='b',
                    label='predict value', s=10)
        plt.scatter(x[0:50], y1[row][0:50], marker='.', color='g',
                    label='real value', s=10)
        plt.legend(loc='upper right')
    plt.show()

def main():
    """Load both CSV files, run the prediction and plot two examples."""
    wavelengths, reference = read1()
    to_predict = read2()[1]
    first, sixth, _ = error(wavelengths, reference, to_predict)
    figure(wavelengths, first, sixth, to_predict)

# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    main()