# K-Means:最簡單的聚類算法之一。K-Means 算法是一種基於樣本間相似性度量的間接聚類方法,屬於非監督學習方法。此算法以 k 爲參數,把 n 個對象分爲 k 個簇,以使簇內具有較高的相似度,而簇間的相似度較低。相似度的計算根據一個簇中對象的平均值(被看作簇的重心)來進行。此算法首先隨機選擇 k 個對象,每個對象代表一個聚類的質心。對於其餘的每一個對象,根據該對象與各聚類質心之間的距離,把它分配到與之最相似的聚類中。然後,計算每個聚類的新質心。重複上述過程,直到準則函數收斂。
import random
import matplotlib.pyplot as plt
import numpy
class KMeans():
    """Plain-Python k-means clustering.

    Points are indexable sequences of numbers (for example ``[x, y]``).
    ``fit`` picks ``k`` distinct points as initial centers, assigns every
    point to its nearest center, recomputes each center as the mean of its
    members, and repeats until the centers stop moving.
    """

    def __init__(self, k):
        """Create a clusterer that splits data into ``k`` clusters."""
        self.__k = k  # number of clusters
        self.__data = []  # points passed to the most recent fit()
        self.__pointCenter = []  # current centers; first drawn at random from __data
        self.__result = [[] for _ in range(k)]  # member points, one list per cluster

    def fit(self, data, threshold, times=50000):
        """Cluster ``data`` and return ``(centers, clusters)``.

        :param data: sequence of points (each a sequence of numbers)
        :param threshold: stop once the mean distance between the previous
            and the recomputed centers is no greater than this value
        :param times: maximum number of refinement iterations
        :return: tuple ``(centers, clusters)``; ``clusters[i]`` holds the
            points assigned to ``centers[i]``
        :raises ValueError: if ``data`` has fewer than ``k`` distinct points
        """
        self.__data = data
        # Start from a clean state so that calling fit() again on the same
        # instance does not reuse old centers or accumulate old members.
        self.__pointCenter = []
        self.randomCenter()

        oldCenterPoint = self.__pointCenter
        result = self._assignToNearest(oldCenterPoint)
        newCenterPoint = self.calNewPointCenter(result, oldCenterPoint)

        # `times` bounds the loop so a threshold that can never be met
        # (e.g. centers that keep oscillating) cannot hang the caller.
        while (times > 0 and
               self.calCenterToCenterDistance(oldCenterPoint, newCenterPoint) > threshold):
            times -= 1
            oldCenterPoint = newCenterPoint
            result = self._assignToNearest(oldCenterPoint)
            newCenterPoint = self.calNewPointCenter(result, oldCenterPoint)

        self.__result = result
        self.__pointCenter = newCenterPoint
        return newCenterPoint, self.__result

    def _assignToNearest(self, centers):
        """Group the stored data points by the index of their nearest center."""
        clusters = [[] for _ in range(self.__k)]
        distances = self.calPointCenterDistance(centers, self.__data)
        for point, row in zip(self.__data, distances):
            # On ties list.index() returns the first (lowest-index) center.
            clusters[row.index(min(row))].append(point)
        return clusters

    def randomCenter(self):
        """Top up the center list with random, mutually distinct data points.

        :raises ValueError: if the data does not contain enough distinct
            points to reach ``k`` centers
        """
        if len(self.__pointCenter) >= self.__k:
            return
        # Visit every index at most once, in random order, so the search
        # always terminates even when the data holds many duplicates.
        order = list(range(len(self.__data)))
        random.shuffle(order)
        for index in order:
            candidate = self.__data[index]
            if candidate not in self.__pointCenter:  # skip duplicate points
                self.__pointCenter.append(candidate)
                if len(self.__pointCenter) >= self.__k:
                    return
        raise ValueError(
            "need at least %d distinct points to seed the centers, found %d"
            % (self.__k, len(self.__pointCenter)))

    def distance(self, pointer1, pointer2):
        """Return the Euclidean distance between two points of any dimension."""
        return (sum([(x1 - x2) ** 2 for x1, x2 in zip(pointer1, pointer2)])) ** 0.5

    def calPointCenterDistance(self, center, data):
        """Return, for every point in ``data``, its distances to each center.

        The result has ``len(data)`` rows with ``len(center)`` values each.
        """
        return [[self.distance(point, c) for c in center] for point in data]

    def calCenterToCenterDistance(self, old, new):
        """Return the mean distance between corresponding old and new centers.

        :param old: centers from the previous iteration
        :param new: freshly recomputed centers
        :return: sum of pairwise distances divided by ``len(old)``
        """
        return sum(self.distance(point1, point2)
                   for point1, point2 in zip(old, new)) / len(old)

    def calNewPointCenter(self, result, fallback=None):
        """Compute the mean point of every cluster.

        :param result: list of clusters, each a list of points
        :param fallback: optional list of centers, parallel to ``result``;
            an empty cluster keeps ``fallback[i]`` instead of a mean
        :return: list with one center per cluster
        :raises ValueError: if a cluster is empty and no ``fallback`` is given
        """
        newCenterPoint = []
        for position, cluster in enumerate(result):
            if not cluster:
                # A mean of zero points is undefined; keep the old center.
                if fallback is None:
                    raise ValueError(
                        "cluster %d is empty and no fallback center was given"
                        % position)
                newCenterPoint.append(list(fallback[position]))
                continue
            dimensions = len(cluster[0])
            # Average each coordinate over all members of the cluster.
            newCenterPoint.append(
                [sum(point[d] for point in cluster) / len(cluster)
                 for d in range(dimensions)])
        return newCenterPoint
if __name__ == "__main__":
    # Demo: cluster 1000 random 2-D integer points into 5 groups, ten times
    # over, printing the centers and showing a scatter plot for every run.
    data = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(1000)]
    for _ in range(10):
        kmeans = KMeans(k=5)
        centerPoint, result = kmeans.fit(data, 0.0001)
        print(centerPoint)
        plt.plot()
        plt.title("KMeans Classification")
        tempx = []
        tempy = []
        color = []
        for label, cluster in enumerate(result):
            # Flatten the clusters into parallel x / y / colour lists. An
            # empty cluster simply contributes nothing instead of crashing.
            tempx += [point[0] for point in cluster]
            tempy += [point[1] for point in cluster]
            # Colour values step by 2 per cluster (0, 2, 4, ...).
            color += [2 * label] * len(cluster)
        plt.scatter(tempx, tempy, c=color, s=30)
        plt.show()