Python 機器學習基礎:K-means

K-means:最簡單的聚類算法之一。K-means 算法是一種基於樣本間相似性度量的間接聚類方法,屬於非監督學習方法。此算法以 k 爲參數,把 n 個對象分爲 k 個簇,使簇內具有較高的相似度,而簇間的相似度較低。相似度根據簇中對象的平均值(即簇的質心)來計算。算法首先隨機選擇 k 個對象,每個對象代表一個聚類的初始質心;對於其餘的每一個對象,根據該對象與各聚類質心之間的距離,把它分配到距離最近(最相似)的聚類中;然後重新計算每個聚類的質心。重複上述分配與更新過程,直到準則函數收斂(例如質心的移動量小於給定的閾值)。

import random
import matplotlib.pyplot as plt
import numpy

class KMeans():
    """Pure-Python k-means clustering.

    Points are sequences of numbers (e.g. ``[x, y]``).  ``fit`` picks k
    distinct data points at random as initial centres, then alternates
    between assigning every point to its nearest centre and recomputing each
    centre as the per-dimension mean of its cluster.
    """

    def __init__(self, k):
        """Create a clusterer that partitions data into ``k`` clusters.

        :param k: number of clusters; must be at least 1
        :raises ValueError: if ``k`` is smaller than 1
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.__k = k                              # number of clusters
        self.__data = []                          # data passed to the last fit()
        self.__pointCenter = []                   # current centre points
        self.__result = [[] for _ in range(k)]    # one list of points per cluster

    def fit(self, data, threshold, times=50000):
        """Cluster ``data`` and return the final centres and clusters.

        :param data: sequence of points, each a sequence of numbers
        :param threshold: stop once the mean distance the centres moved in
            one update is not greater than this value
        :param times: maximum number of refinement iterations performed
            after the initial assignment
        :return: tuple ``(centers, clusters)``; ``clusters[i]`` holds the
            points assigned to ``centers[i]``
        :raises ValueError: if ``data`` has fewer than k distinct points
        """
        self.__data = data
        # Forget the centres of any previous fit(); otherwise randomCenter()
        # would keep them and the new run would start from stale state.
        self.__pointCenter = []
        self.randomCenter()

        oldCenterPoint = self.__pointCenter
        # The clusters are rebuilt from scratch on every assignment, so
        # points from a previous fit() never leak into the result.
        self.__result = self._assignToNearest(oldCenterPoint)
        newCenterPoint = self.calNewPointCenter(self.__result, oldCenterPoint)

        # `times` caps the number of refinements so that a threshold which
        # is never reached cannot keep the loop running indefinitely.
        while (times > 0 and
               self.calCenterToCenterDistance(oldCenterPoint, newCenterPoint) > threshold):
            times -= 1
            # Remember the centres used for this assignment.
            oldCenterPoint = newCenterPoint
            self.__result = self._assignToNearest(oldCenterPoint)
            newCenterPoint = self.calNewPointCenter(self.__result, oldCenterPoint)

        self.__pointCenter = newCenterPoint
        return newCenterPoint, self.__result

    def _assignToNearest(self, center):
        """Return k lists; list i holds the data points nearest to center[i]."""
        clusters = [[] for _ in range(self.__k)]
        distances = self.calPointCenterDistance(center, self.__data)
        for point, row in zip(self.__data, distances):
            # On ties list.index() picks the first (lowest-index) centre.
            clusters[row.index(min(row))].append(point)
        return clusters

    def randomCenter(self):
        """Fill the centre list with k distinct points drawn from the data.

        :raises ValueError: if the data has fewer than k distinct points
            (the rejection loop below could never terminate in that case)
        """
        distinct = []
        for point in self.__data:
            if point not in distinct:
                distinct.append(point)
                if len(distinct) >= self.__k:
                    break
        if len(distinct) < self.__k:
            raise ValueError(
                "need at least %d distinct points to choose %d centres"
                % (self.__k, self.__k))

        while len(self.__pointCenter) < self.__k:
            index = random.randint(0, len(self.__data) - 1)    # random index
            candidate = self.__data[index]
            if candidate not in self.__pointCenter:            # skip duplicates
                self.__pointCenter.append(candidate)

    def distance(self, pointer1, pointer2):
        """Return the Euclidean distance between two points of any dimension.

        Coordinates are paired with ``zip``, so extra coordinates of the
        longer point are ignored.
        """
        return sum((x1 - x2) ** 2 for x1, x2 in zip(pointer1, pointer2)) ** 0.5

    def calPointCenterDistance(self, center, data):
        """Return, for every point in ``data``, its distance to each centre.

        :param center: sequence of centre points
        :param data: sequence of data points
        :return: list with ``len(data)`` rows of ``len(center)`` distances
        """
        return [[self.distance(point, c) for c in center] for point in data]

    def calCenterToCenterDistance(self, old, new):
        """Return the mean distance between paired old and new centres.

        :param old: centres before the update
        :param new: centres after the update
        :return: sum of pairwise distances divided by ``len(old)``;
            ``0.0`` when ``old`` is empty
        """
        if not old:
            return 0.0
        total = 0
        for point1, point2 in zip(old, new):
            total += self.distance(point1, point2)
        return total / len(old)

    def calNewPointCenter(self, result, oldCenter=None):
        """Return the per-dimension mean of every cluster in ``result``.

        :param result: list of clusters, each a list of points
        :param oldCenter: optional previous centres, index-aligned with
            ``result``; an empty cluster keeps a copy of its previous centre
        :return: list of new centre points, one per cluster
        :raises ValueError: if a cluster is empty and ``oldCenter`` is None
        """
        newCenterPoint = []
        for index, cluster in enumerate(result):
            if not cluster:
                # A mean over zero points is undefined; fall back to the
                # previous centre when the caller supplied one.
                if oldCenter is None:
                    raise ValueError(
                        "cluster %d is empty and no previous centre was given"
                        % index)
                newCenterPoint.append(list(oldCenter[index]))
                continue
            # zip(*cluster) groups the coordinates by dimension.
            newCenterPoint.append(
                [sum(axis) / len(cluster) for axis in zip(*cluster)])
        return newCenterPoint

if __name__ == "__main__":
    # Demo: cluster 1000 random integer 2-D points into 5 groups, ten times
    # over the same data, printing the centres and showing a scatter plot
    # for each run.
    data = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(1000)]
    for _ in range(10):
        # A fresh instance per run, so every run draws new random centres.
        kmeans = KMeans(k=5)
        centerPoint, result = kmeans.fit(data, 0.0001)
        print(centerPoint)
        plt.title("KMeans Classification")

        xs = []
        ys = []
        colors = []
        for label, cluster in enumerate(result):
            if not cluster:
                # Nothing to draw for a cluster without points.
                continue
            xs.extend(point[0] for point in cluster)
            ys.extend(point[1] for point in cluster)
            # One numeric colour value per point; points of the same
            # cluster share the same value.
            colors.extend([label] * len(cluster))
        plt.scatter(xs, ys, c=colors, s=30)
        plt.show()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章