參考:
機器學習實戰教程
機器學習實戰書籍下載 - - 密碼:qi7q
k-近鄰算法
公式編輯器
drawio畫流程圖
百度思維導圖
1.算法理論
k-近鄰算法 是一種特徵搜索的方法(相似性搜索):
-
1、準備訓練集
- 數據集量化 (將文本數據轉成數字)
- 特徵做歸一化 (訓練集與測試集都做)
-
2、每個測試樣本與訓練集中所有樣本計算距離,按距離排序查找k個樣本
- 可以使用的距離度量方法有:
- 歐式距離(常用)
- 閔可夫斯基距離
- 曼哈頓距離
- 切比雪夫距離
- 馬哈拉諾比斯距離(馬氏距離)
- 相似度度量
- 向量空間餘弦相似度
- 皮爾森相關係數
-
3、統計這k個樣本
- 分類
- 投票機制(少數服從多數)
- 迴歸
- 距離加權平均(類似於插值方式,距離越小權重越大)
2.實踐
1.knn分類
"""
Author:wucng
Time: 20200107
Summary: 使用KNN(K-近鄰)對iris數據分類
數據下載:https://archive.ics.uci.edu/ml/datasets.php
源代碼: https://github.com/wucng/MLAndDL
"""
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.metrics import accuracy_score,auc
import pandas as pd
import numpy as np
import os
import time
# 1.加載數據集(並做預處理)
def loadData(dataPath: str) -> tuple:
    """Load the iris CSV file and return min-max scaled features and labels.

    The file is read as a header-less, comma-separated table with five
    columns: four measurements followed by the species name.

    Args:
        dataPath: Path to the CSV file.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays. ``X`` holds the four feature
        columns, each scaled to ``[0, 1]``; ``y`` holds the labels with the
        three known species names encoded as 0, 1 and 2.
    """
    columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "label"]
    # header=None means "the file has no header row". The previous header=-1
    # is not accepted by current pandas releases.
    df = pd.read_csv(dataPath, sep=",", header=None, names=columns)

    # Fill missing values with 0.
    df = df.fillna(0)

    # Encode the species names as integers. Values that are not one of the
    # three known names pass through unchanged; infer_objects() then gives an
    # integer column whenever every label was encoded.
    label_codes = {"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 2}
    labels = df["label"].map(lambda name: label_codes.get(name, name)).infer_objects()

    # Min-max scale every feature column.
    features = df.drop("label", axis=1)
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min
    # A constant column has zero range; divide by 1 so it scales to 0
    # instead of producing 0/0 = NaN.
    col_range = col_range.where(col_range != 0, 1.0)
    features = (features - col_min) / col_range

    return (features.to_numpy(), labels.to_numpy())
class KNN(object):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    All test-to-train distances are computed once in the constructor.
    ``predict`` then takes a majority vote among the ``k`` closest training
    samples of every test sample.
    """

    def __init__(self, X_train: np.ndarray, X_test: np.ndarray,
                 y_train: np.ndarray, y_test: np.ndarray = None, k: int = 5):
        """Store the data and pre-compute the distance matrix.

        Args:
            X_train: Training features, one row per sample.
            X_test: Features of the samples to classify, one row per sample.
            y_train: Label of every training sample.
            y_test: Optional true labels of the test samples; only needed
                by ``accuracy``.
            k: Number of nearest neighbours that vote.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.k = k
        # Filled in by predict(); None means "not predicted yet".
        self.y_pred = None
        self.__calDistance()

    # 2. Distance from every test sample to every training sample.
    def __calDistance(self):
        """Fill ``self.result_dist`` with shape (n_test, n_train) distances.

        Rows are deliberately left unsorted so that column ``j`` still refers
        to training sample ``j`` (and therefore to ``y_train[j]``).
        """
        train = np.asarray(self.X_train)
        result_dist = np.zeros([len(self.X_test), len(self.X_train)])
        for i, sample in enumerate(self.X_test):
            # Broadcasting subtracts the sample from every training row, so
            # no tiled copy of the sample is needed.
            result_dist[i] = np.sqrt(np.sum((sample - train) ** 2, -1))
        self.result_dist = result_dist

    def __calDistance2(self):
        """Pure-Python double-loop variant of ``__calDistance``.

        Not called anywhere in this class; kept as a readable reference.
        """
        result_dist = np.zeros([len(self.X_test), len(self.X_train)])
        for i, data_test in enumerate(self.X_test):
            dist = np.zeros((len(self.X_train),))
            for j, data in enumerate(self.X_train):
                dist[j] = (sum((data_test - data) ** 2)) ** 0.5
            result_dist[i] = dist
        self.result_dist = result_dist

    # 3. Decide the class from the distances.
    def predict(self):
        """Return the majority label among the k nearest training samples.

        The vote is counted with ``np.unique``, so labels may be any sortable
        values (strings, floats, negative ints), not only non-negative
        integers. On a tie the smallest label wins.

        Returns:
            1-D array with one predicted label per test sample; also stored
            in ``self.y_pred``.
        """
        # Indices of the k smallest distances per test sample.
        nearest = np.argsort(self.result_dist, -1)[:, :self.k]
        # Replace every index by the label of that training sample.
        neighbour_labels = np.asarray(self.y_train)[nearest]

        votes = []
        for row in neighbour_labels:
            labels, counts = np.unique(row, return_counts=True)
            votes.append(labels[counts.argmax()])
        self.y_pred = np.asarray(votes)
        return self.y_pred

    # 4. Accuracy.
    def accuracy(self):
        """Return the fraction of test samples that were classified correctly.

        Runs ``predict`` first if it has not been called yet.

        Raises:
            AssertionError: If ``y_test`` was not supplied or its length does
                not match the predictions. The type matches the earlier
                assert-based checks, but is raised explicitly so it also
                fires under ``python -O``.
        """
        if self.y_test is None:
            raise AssertionError("y_test is required to compute the accuracy")
        if self.y_pred is None:
            self.predict()
        if len(self.y_pred) != len(self.y_test):
            raise AssertionError("y_pred and y_test differ in length")
        return np.sum(self.y_pred == self.y_test) / len(self.y_test)
if __name__ == "__main__":
    # Compare the hand-written KNN classifier with scikit-learn's
    # KNeighborsClassifier on the iris data set.
    dataPath = "../../dataset/iris.data"
    X, y = loadData(dataPath)
    # Author's note on the expected shapes: (150, 4) (150,)

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    start = time.time()
    clf = KNN(X_train, X_test, y_train, y_test, 3)
    y_pred = clf.predict()
    print("cost time:%.6f(s) acc:%.3f" % (time.time() - start, clf.accuracy()))
    # Sample output reported by the author: cost time:0.001994(s) acc:1.000

    # ----------------------------------------------------------------------
    # Same task with scikit-learn's KNeighborsClassifier.
    start = time.time()
    clf = KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='kd_tree').fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # The value printed here is an accuracy, so it is labelled "acc"
    # (it was previously printed under the label "error").
    print("cost time:%.6f(s) acc:%.3f" % (time.time() - start, acc))
    # Sample output reported by the author: cost time:0.001994(s) acc:1.000
2.knn迴歸
"""
Author:wucng
Time: 20200108
Summary: 使用KNN(K-近鄰)對boston房價做迴歸預測
數據下載:https://archive.ics.uci.edu/ml/datasets.php
源代碼: https://github.com/wucng/MLAndDL
"""
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.metrics import accuracy_score,auc
import pandas as pd
import numpy as np
import os
import time
# 1.加載數據集(並做預處理)
def loadData(dataPath: str) -> tuple:
    """Parse a two-lines-per-record text file into scaled features and targets.

    Every line whose first character is a digit starts a record. That line
    and the line immediately after it are split on whitespace, converted to
    floats and concatenated into one row. All other lines are skipped.

    Args:
        dataPath: Path to the text file.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays. ``y`` is the last value of every
        record; ``X`` holds the remaining values, min-max scaled per column
        to ``[0, 1]``.

    Raises:
        IndexError: If the file ends right after the first line of a record.
    """
    dataset = []
    with open(dataPath, "r") as fp:
        line_iter = iter(fp)
        for line in line_iter:
            if not line[0].isdigit():
                # Not the start of a record (header text, blank line, ...).
                continue
            follow_up = next(line_iter, None)
            if follow_up is None:
                raise IndexError(
                    "record starting with %r has no second line" % line.strip())
            # split() without an argument tolerates repeated spaces and tabs;
            # split(" ") would yield empty tokens that float() rejects.
            record = [float(token) for token in line.split()]
            record.extend(float(token) for token in follow_up.split())
            dataset.append(record)

    dataset = np.asarray(dataset)
    # Separate the feature columns from the target column.
    X, y = dataset[..., :-1], dataset[..., -1]

    # Min-max scale every feature column.
    col_min = np.min(X, axis=0)
    col_range = np.max(X, axis=0) - col_min
    # A constant column has zero range; divide by 1 so it scales to 0
    # instead of producing 0/0 = NaN.
    col_range = np.where(col_range == 0, 1.0, col_range)
    X = (X - col_min) / col_range
    return (X, y)
class KNN(object):
    """Brute-force k-nearest-neighbours regressor using Euclidean distance.

    All test-to-train distances are computed once in the constructor.
    ``predict`` then returns, for every test sample, a distance-weighted
    average of the targets of its ``k`` closest training samples.
    """

    def __init__(self, X_train: np.ndarray, X_test: np.ndarray,
                 y_train: np.ndarray, y_test: np.ndarray = None, k: int = 5):
        """Store the data and pre-compute the distance matrix.

        Args:
            X_train: Training features, one row per sample.
            X_test: Features of the samples to predict, one row per sample.
            y_train: Target value of every training sample.
            y_test: Optional true targets of the test samples; only needed
                by ``error``.
            k: Number of nearest neighbours that are averaged.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.k = k
        # Filled in by predict(); None means "not predicted yet".
        self.y_pred = None
        self.__calDistance()

    # 2. Distance from every test sample to every training sample.
    def __calDistance(self):
        """Fill ``self.result_dist`` with shape (n_test, n_train) distances.

        Rows are deliberately left unsorted so that column ``j`` still refers
        to training sample ``j`` (and therefore to ``y_train[j]``).
        """
        train = np.asarray(self.X_train)
        result_dist = np.zeros([len(self.X_test), len(self.X_train)])
        for i, sample in enumerate(self.X_test):
            # Broadcasting subtracts the sample from every training row, so
            # no tiled copy of the sample is needed.
            result_dist[i] = np.sqrt(np.sum((sample - train) ** 2, -1))
        self.result_dist = result_dist

    def __calDistance2(self):
        """Pure-Python double-loop variant of ``__calDistance``.

        Not called anywhere in this class; kept as a readable reference.
        """
        result_dist = np.zeros([len(self.X_test), len(self.X_train)])
        for i, data_test in enumerate(self.X_test):
            dist = np.zeros((len(self.X_train),))
            for j, data in enumerate(self.X_train):
                dist[j] = (sum((data_test - data) ** 2)) ** 0.5
            result_dist[i] = dist
        self.result_dist = result_dist

    # 3. Predict from the distances.
    def predict(self):
        """Return the distance-weighted mean target of the k nearest samples.

        Neighbour ``i`` at distance ``d_i`` gets a weight proportional to
        ``exp(-d_i)``, so closer neighbours count more. The weights are
        normalised to sum to 1 per test sample.

        Returns:
            1-D float array with one prediction per test sample; also stored
            in ``self.y_pred``.
        """
        # Indices of the k smallest distances per test sample, nearest first.
        nearest = np.argsort(self.result_dist, -1)[:, :self.k]
        neighbour_targets = np.asarray(self.y_train)[nearest]
        # Reuse the argsort result to fetch the matching distances instead of
        # sorting the whole distance matrix a second time.
        neighbour_dists = np.take_along_axis(self.result_dist, nearest, axis=-1)

        # Subtract the nearest distance (first column) before exponentiating.
        # The normalised weights are mathematically unchanged, but the
        # largest weight is now exp(0) = 1, so the weights can no longer all
        # underflow to 0 and turn the average into 0/0 = NaN.
        shifted = neighbour_dists - neighbour_dists[:, :1]
        weights = np.exp(-shifted)
        weights = weights / np.sum(weights, axis=-1, keepdims=True)

        self.y_pred = np.sum(neighbour_targets * weights, axis=-1)
        return self.y_pred

    # 4. Error metric.
    def error(self):
        """Return the mean squared error between predictions and ``y_test``.

        Runs ``predict`` first if it has not been called yet.

        Raises:
            AssertionError: If ``y_test`` was not supplied or its length does
                not match the predictions. The type matches the earlier
                assert-based checks, but is raised explicitly so it also
                fires under ``python -O``.
        """
        if self.y_test is None:
            raise AssertionError("y_test is required to compute the error")
        if self.y_pred is None:
            self.predict()
        if len(self.y_pred) != len(self.y_test):
            raise AssertionError("y_pred and y_test differ in length")
        return np.sum((self.y_pred - self.y_test) ** 2) / len(self.y_test)
if __name__ == "__main__":
    # Compare the hand-written KNN regressor with scikit-learn's
    # KNeighborsRegressor on the Boston housing data.
    features, targets = loadData("../../dataset/boston.txt")
    print(features.shape, targets.shape)  # author's note: (506, 13) (506,)

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42)

    # Both runs report elapsed seconds and mean squared error in one format.
    report = "cost time:%.6f(s) error:%.3f"

    tic = time.time()
    own_model = KNN(X_train, X_test, y_train, y_test, 3)
    own_model.predict()
    elapsed = time.time() - tic  # taken before error() is evaluated
    print(report % (elapsed, own_model.error()))
    # Sample output reported by the author: cost time:0.012968(s) error:19.994

    # ----------------------------------------------------------------------
    # Same task with scikit-learn's KNeighborsRegressor.
    tic = time.time()
    sk_model = KNeighborsRegressor(n_neighbors=3, weights='distance', algorithm='kd_tree')
    sk_model.fit(X_train, y_train)
    sk_pred = sk_model.predict(X_test)
    sk_mse = np.sum((sk_pred - y_test) ** 2) / len(y_test)
    print(report % (time.time() - tic, sk_mse))
    # Sample output reported by the author: cost time:0.004056(s) error:18.905
總結
-
1、KNN分類:如果按距離選出的K個樣本中,有一個被噪聲污染(類別不對),但其他K-1個樣本沒有問題,仍能保證正確類別的票數多於錯誤類別,因此能夠正確分類(KNN分類對噪聲點不太敏感)
-
2、KNN迴歸:如果按距離選出的K個樣本中,有一個被噪聲污染(其對應的輸出值偏差很大),而且這個噪聲點離要預測的樣本距離又很近,那麼它所佔的權重會很大,導致最終的預測值偏移過大,因此KNN迴歸對噪聲點很敏感
-
3、如果數據量變大、特徵維度變大,那麼KNN搜索會變得非常慢
- 改進方式:可以使用kd_tree建立搜索空間
-
4、KNN分類可以應用於特徵比對(優點不需要訓練,完全依靠底庫(訓練樣本)的特徵)
- 人臉比對,獲取一張新人臉圖片(要預測的樣本)與底庫(訓練樣本)做比對
- 圖片搜索等