01. k-Nearest Neighbors (KNN)

References:
Machine Learning in Action tutorial
Machine Learning in Action book download - - password: qi7q
k-nearest neighbors algorithm
Formula editor
drawio flowchart tool
Baidu mind map


What is learned on paper is always shallow; to truly understand, one must practice it oneself.

1. Algorithm Theory

The k-nearest neighbors algorithm is a feature-search method (similarity search):

  • 1. Prepare the training set $D=\{(x_1,y_1),(x_2,y_2),(x_3,y_3),\dots,(x_n,y_n)\}$

    • Quantize the dataset (convert text data into numbers)
    • Normalize the features (apply to both the training set and the test set)
  • 2. For each test sample, compute the distance to every sample in the training set, sort by distance, and take the k nearest samples

    • Usable distance metrics include:
      • Euclidean distance (most common)
      • Minkowski distance
      • Manhattan distance
      • Chebyshev distance
      • Mahalanobis distance
      • Similarity measures:
      • Cosine similarity in vector space
      • Pearson correlation coefficient
  • 3. Aggregate these k samples (see the sketch after this list)

    • Classification
      • Voting (majority rule)
    • Regression
      • Distance-weighted average (similar to interpolation: the smaller the distance, the larger the weight)
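
A minimal NumPy sketch (not part of the original post; the function names are illustrative) of the distance metrics listed above and the two aggregation rules:

import numpy as np

# Illustrative implementations of some of the metrics listed above
def euclidean(a, b):
    return np.sqrt(np.sum((a - b) ** 2))

def manhattan(a, b):
    return np.sum(np.abs(a - b))

def chebyshev(a, b):
    return np.max(np.abs(a - b))

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Classification: majority vote over the k nearest labels
labels = np.array([0, 1, 1])
print(np.bincount(labels).argmax())   # -> 1

# Regression: distance-weighted average (smaller distance -> larger weight)
dists = np.array([0.1, 0.5, 0.9])
values = np.array([2.0, 3.0, 4.0])
w = np.exp(-dists)                    # same e^(-x) weighting used in the code below
print(np.sum(values * w) / np.sum(w))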

2. Practice

1. KNN classification

"""
Author:wucng
Time:  20200107
Summary: 使用KNN(K-近領域)對iris數據分類
數據下載:https://archive.ics.uci.edu/ml/datasets.php
源代碼: https://github.com/wucng/MLAndDL
"""
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.metrics import accuracy_score,auc
import pandas as pd
import numpy as np
import os
import time


# 1. Load the dataset (with preprocessing)
def loadData(dataPath: str) -> tuple:
    # If the file had a header row, header and names could be omitted; sep is the field separator
    df = pd.read_csv(dataPath, sep=",", header=None,
                     names=["sepal_length", "sepal_width", "petal_length", "petal_width", "label"])
    # Fill missing values
    df = df.fillna(0)
    # Quantize the data
    # (map text labels to integers)
    df.replace("Iris-setosa", 0, inplace=True)
    df.replace("Iris-versicolor", 1, inplace=True)
    df.replace("Iris-virginica", 2, inplace=True)

    # Split into feature data and label data
    X = df.drop("label", axis=1)  # feature data
    y = df.label  # or df["label"]  # label data

    # Min-max normalization
    X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))

    # Equivalent with sklearn:
    # X = MinMaxScaler().fit_transform(X)

    # Inspect the DataFrame
    # df.info()
    # df.describe()
    return (X.to_numpy(), y.to_numpy())

class KNN(object):
    """Uses Euclidean distance by default"""
    def __init__(self, X_train: np.ndarray, X_test: np.ndarray,
                 y_train: np.ndarray, y_test: np.ndarray = None, k: int = 5):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.k = k

        self.__calDistance()

    # 2. Compute the distance from each test sample to every training sample
    def __calDistance(self):
        result_dist = np.zeros([len(self.X_test), len(self.X_train)])
        for i, data in enumerate(self.X_test):
            data = np.tile(data, (len(self.X_train), 1))
            distance = np.sqrt(np.sum((data - self.X_train) ** 2, -1))
            # result_dist[i] = sorted(distance)  # don't sort yet, or the indices no longer line up with the labels
            result_dist[i] = distance

        self.result_dist = result_dist
        # return result_dist
    # Alternative: pure-Python double loop (slower; kept for reference, unused)
    def __calDistance2(self):
        result_dist = np.zeros([len(self.X_test), len(self.X_train)])
        for i, data_test in enumerate(self.X_test):
            dist = np.zeros((len(self.X_train),))
            for j,data in enumerate(self.X_train):
                dist[j] = (sum((data_test-data)**2))**0.5

            result_dist[i] = dist

        self.result_dist = result_dist
        # return result_dist

    # 3. Determine the class from the distances
    def predict(self):
        """k: number of nearest neighbors to use"""
        # Sort distances in ascending order and take the indices of the k nearest
        result_index = np.argsort(self.result_dist, -1)[:, :self.k]

        # Replace indices with their corresponding labels
        y_pred = self.y_train[result_index]

        # The most frequent label among the k neighbors is the prediction
        y_pred = [np.bincount(pred).argmax() for pred in y_pred]
        self.y_pred = np.asarray(y_pred)

        return self.y_pred

    # 4. Compute accuracy
    def accuracy(self):
        assert self.y_test is not None, "y_test is required"
        assert len(self.y_pred) == len(self.y_test), "length mismatch"
        return np.sum(self.y_pred == self.y_test) / len(self.y_test)

if __name__ == "__main__":
    dataPath = "../../dataset/iris.data"
    X,y = loadData(dataPath)
    # print(X.shape,y.shape) # (150, 4) (150,)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
                                            X, y, test_size = 0.2, random_state = 42)

    start = time.time()
    clf = KNN(X_train, X_test, y_train, y_test,3)
    y_pred = clf.predict()
    # print(y_pred)

    print("cost time:%.6f(s) acc:%.3f"%(time.time()-start,clf.accuracy()))
    # cost time:0.001994(s) acc:1.000

    # ----------------------------------------------------------------------
    # Using sklearn's KNeighborsClassifier
    start = time.time()
    clf = KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='kd_tree').fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test,y_pred)
    print("cost time:%.6f(s) error:%.3f" % (time.time() - start, acc))
    # cost time:0.001994(s) error:1.000

2. KNN regression

"""
Author:wucng
Time:  20200108
Summary: 使用KNN(K-近領域)對boston房價做迴歸預測
數據下載:https://archive.ics.uci.edu/ml/datasets.php
源代碼: https://github.com/wucng/MLAndDL
"""
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.metrics import accuracy_score,auc
import pandas as pd
import numpy as np
import os
import time


# 1. Load the dataset (with preprocessing)
def loadData(dataPath: str) -> tuple:
    with open(dataPath,"r") as fp:
        lines = fp.readlines()
        dataset=[]
        i = 0
        while i<len(lines):
            line = lines[i]
            i += 1
            if line[0].isdigit():  # line starts with a digit: each record spans two lines
                data1 = list(map(float, line.strip().split()))
                line = lines[i]
                i += 1
                data2 = list(map(float, line.strip().split()))
                data1.extend(data2)
                dataset.append(data1)
            else:
                continue

        dataset = np.asarray(dataset)

        # Split into features and labels
        X,y = dataset[...,:-1],dataset[...,-1]

        # Min-max normalization
        X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))

        # Equivalent with sklearn:
        # X = MinMaxScaler().fit_transform(X)

    return (X,y)


class KNN(object):
    """Uses Euclidean distance by default"""
    def __init__(self, X_train: np.ndarray, X_test: np.ndarray,
                 y_train: np.ndarray, y_test: np.ndarray = None, k: int = 5):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.k = k

        self.__calDistance()

    # 2. Compute the distance from each test sample to every training sample
    def __calDistance(self):
        result_dist = np.zeros([len(self.X_test),len(self.X_train)])
        for i,data in enumerate(self.X_test):
            data = np.tile(data,(len(self.X_train),1))
            distance = np.sqrt(np.sum((data-self.X_train)**2,-1))
            # result_dist[i] = sorted(distance)  # don't sort yet, or the indices no longer line up with the labels
            result_dist[i] = distance

        self.result_dist = result_dist
        # return result_dist

    # Alternative: pure-Python double loop (slower; kept for reference, unused)
    def __calDistance2(self):
        result_dist = np.zeros([len(self.X_test), len(self.X_train)])
        for i, data_test in enumerate(self.X_test):
            dist = np.zeros((len(self.X_train),))
            for j,data in enumerate(self.X_train):
                dist[j] = (sum((data_test-data)**2))**0.5

            result_dist[i] = dist

        self.result_dist = result_dist
        # return result_dist

    # 3. Predict the target value from the distances
    def predict(self):
        """k: number of nearest neighbors to use"""
        # Sort distances in ascending order and take the indices of the k nearest
        result_index = np.argsort(self.result_dist, -1)[:, :self.k]

        # Replace indices with their corresponding target values
        y_pred = self.y_train[result_index]

        # Distance-weighted average gives the prediction
        # Get the corresponding distances
        dists = self.result_dist.copy()
        dists.sort(-1)
        dists = dists[..., :self.k]
        # Weight by distance (the closer the neighbor, the larger the weight)
        preds = []
        for pred, dist in zip(y_pred, dists):
            dist = np.exp(dist * (-1))  # use e^(-x) as the weight
            preds.append(np.sum(pred * dist / np.sum(dist)))

            # Alternatively, assume a normal distribution: f(x) = 1/(sqrt(2*pi)*sigma)*e^(-(x-mu)^2/(2*sigma^2))
            # and take the standard normal density as the weight:
            # dist = 1/(np.sqrt(2*np.pi))*np.exp(-(dist)**2/2)
            # preds.append(np.sum(pred * dist / np.sum(dist)))

        self.y_pred = np.asarray(preds)

        return self.y_pred

    # 4. Compute the mean squared error
    def error(self):
        assert self.y_test is not None, "y_test is required"
        assert len(self.y_pred) == len(self.y_test), "length mismatch"
        return np.sum((self.y_pred - self.y_test) ** 2) / len(self.y_test)

if __name__ == "__main__":
    dataPath = "../../dataset/boston.txt"
    X,y = loadData(dataPath)
    print(X.shape,y.shape) # (506, 13) (506,)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
                                            X, y, test_size = 0.2, random_state = 42)

    start = time.time()
    clf = KNN(X_train, X_test, y_train, y_test,3)
    y_pred = clf.predict()
    # print(y_pred)

    print("cost time:%.6f(s) error:%.3f"%(time.time()-start,clf.error()))
    # cost time:0.012968(s) error:19.994

    # ----------------------------------------------------------------------
    # Using sklearn's KNeighborsRegressor
    # (note: weights='distance' uses inverse-distance weighting, not the e^(-x) weighting above)
    start = time.time()
    clf = KNeighborsRegressor(n_neighbors=3,weights='distance',algorithm='kd_tree').fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    error = np.sum((y_pred-y_test)**2)/len(y_test)
    print("cost time:%.6f(s) error:%.3f"%(time.time()-start,error))
    # cost time:0.004056(s) error:18.905

Summary

  • 1. KNN classification: if one of the k samples selected by distance has been corrupted by noise (wrong class) while the other k-1 are fine, the correct class still outnumbers the wrong one and the prediction remains correct (so KNN classification is not very sensitive to noisy points)

  • 2. KNN regression: if one of the k samples selected by distance has been corrupted by noise (its output value deviates greatly) and that noisy sample is also very close to the sample being predicted, it receives a large weight and pulls the final prediction far off, so KNN regression is very sensitive to noisy points

  • 3. As the dataset grows and the feature dimensionality increases, KNN search becomes very slow

    • Improvement: use a kd_tree to build the search space (see the sketch after this list)
  • 4. KNN classification can be applied to feature matching (advantage: no training needed; it relies entirely on the features of the gallery, i.e. the training samples)

    • Face matching: compare a new face image (the sample to predict) against the gallery (the training samples)
    • Image retrieval, etc.
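
A minimal sketch of the kd-tree improvement mentioned in point 3, using sklearn's KDTree class (the data here is synthetic, purely for illustration):

import numpy as np
from sklearn.neighbors import KDTree

rng = np.random.RandomState(42)
X_train = rng.rand(10000, 13)   # synthetic "gallery" / training features
X_query = rng.rand(5, 13)       # synthetic query samples

# Build the kd-tree once over the training set ...
tree = KDTree(X_train, leaf_size=40)
# ... then each k-nearest-neighbor query avoids a brute-force scan of all samples
dist, ind = tree.query(X_query, k=3)
print(dist.shape, ind.shape)    # (5, 3) (5, 3)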