優點:精度高,對異常值不敏感,無數據輸入假定
缺點:計算複雜度高,空間複雜度高
適用數據範圍:數值型和標稱型
僞代碼:
對未知類別屬性的數據集中的每個點依次執行以下操作:
1.計算已知類別數據集中的點與當前點之間的距離;
2.按照距離遞增次序排序
3.選取與當前點距離最小的K個點
4.確定前K個點所在類別的出現頻率
5.返回前K個點出現頻率最高的類別作爲當前點的預測分類
程序清單:
from numpy import *
import operator
def createDataSet():
    """Return a tiny hard-coded sample set for trying out the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        feature vectors and ``labels`` is a list holding the class label of
        each row of ``group``.
    """
    group = array([[1.0, 1.0], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean. When several labels receive the same number of
    votes, the label seen first among the nearest neighbours wins (the sort
    below is stable).

    Args:
        inX: Feature vector to classify (sequence or NumPy array).
        dataSet: Known samples, one row per sample (array or nested list).
        labels: Class label for each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the ``k`` nearest neighbours.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    # Fail early with a clear message instead of an opaque IndexError later.
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and %d, got %r" % (dataSetSize, k)
        )

    # Broadcasting subtracts inX from every row; no explicit tile() needed.
    diffMat = asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndices = distances.argsort()

    # Count how often each label appears among the k closest samples.
    classCount = {}
    for neighbourIndex in sortedDistIndices[:k]:
        voteLabel = labels[neighbourIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]
# Extract data from a file.
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Each non-blank line supplies one sample: its first three tab-separated
    fields are stored as floats in the matrix, and its last field is kept
    as a string label.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` NumPy float array and ``classLabelVector`` is a list of
        ``n`` label strings, ``n`` being the number of non-blank lines.

    Raises:
        ValueError: If a line has fewer than three leading fields or a field
            cannot be converted to float.
    """
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no sample, so they are skipped instead of breaking the parse.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    # Zero-filled NumPy matrix with one row per sample.
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # The first three fields are the features.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every feature column to the range [0, 1] (min-max scaling).

    Each value is mapped with ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: 2-D array (or nested list) with one row per sample and one
            column per feature.

    Returns:
        tuple: ``(normDataset, ranges, minVals)`` where ``normDataset`` is the
        scaled data, ``ranges`` is ``max - min`` of each column and
        ``minVals`` is the minimum of each column.
    """
    dataSet = asarray(dataSet)
    # Axis 0 takes the minimum / maximum of each column, not of each row.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals  # span of values per feature

    # A constant column has range 0; divide by 1 there so it maps to 0
    # instead of producing NaN. The returned ``ranges`` stays untouched.
    divisors = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column shift and scale to every row.
    # Divide by the range (not by the maximum) to get min-max scaling.
    normDataset = (dataSet - minVals) / divisors
    return normDataset, ranges, minVals
def datingClassTest(filename=r"G:\kaggle\pratice\machinelearninginaction-master\Ch02\datingTestSet.txt"):
    """Load the dating data set, normalise it and print the result.

    Args:
        filename: Path of the tab-separated data file to load. Defaults to
            the location used originally; written as a raw string so the
            backslashes are not treated as escape sequences.

    Prints the normalised feature matrix followed by the per-column ranges.
    """
    # NOTE(review): the original declared a hold-out ratio of 0.10 but never
    # used it, so no classification / error-rate evaluation happens here yet.
    datingDataMat, _ = file2matrix(filename)
    normMat, ranges, _ = autoNorm(datingDataMat)
    print(normMat, " ", ranges)
註解:
>>e = array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.]])
>>e.shape
(3,3)
>>e.shape[0] #有多少行
3
numpy.tile([0,0],5) #在列方向重複[0,0]五次
numpy.tile([0,0],(1,1)) #在列方向重複[0,0]一次,在行方向重複[0,0]一次