DBscan算法是典型的基於密度的聚類算法,假定類別可以通過樣本分佈的緊密程度決定。在DBscan算法中,將樣本密集的一個區域內的點作爲一個cluster,再將相連的cluster聚合爲一個cluster。
- 把所有的數據點區分並標記爲核心點、邊界點和噪音點
- 刪除噪音點
- 所有在半徑Eps內的核心點間由邊相連
- 聯通的核心點作爲一組成爲一個cluster
- 分配每一個邊界點到它所落入的核心點半徑範圍內的cluster中
若在某點半徑Eps內有>=minPts個點,則該點就是核心點,
若點不是核心點,但是被其他核心點包括,則爲邊界點
若又不是核心點,又沒被包括,則爲離羣點(噪音點)
- 距離選擇
DBscan對不同的距離指標總體上效果差異不大,選擇歐式距離作爲距離指標。
- 參數選擇
在DBscan中不需要預先決定cluster的數目,算法自動決定聚類的數目,但是需要設定Eps和MinPts兩個參數。在本實驗中對掃描半徑Eps和最小包含點MinPts 的取值如下:
minPts = [3 4 5 6 7 8 9]
Eps = [0.2 0.21 0.22 0.23 0.24 0.25 0.26 0.27 0.28 0.29]
dbscan.py
import numpy as np
import data_utils as du
def get_cores_area(data, dist_matrix, eps, minPts):
    """Map every core point to the points inside its eps-neighbourhood.

    A point is a core point when at least ``minPts`` *other* points (the
    point itself is skipped) have a distance strictly smaller than ``eps``
    in ``dist_matrix``.

    Args:
        data: samples; only used to build the distance matrix when
            ``dist_matrix`` is None.
        dist_matrix: pairwise distances, row ``i`` holding the distances
            from point ``i`` to every point. Computed with
            ``pairwise_distance(data, data)`` when None.
        eps: neighbourhood threshold (compared with ``<``).
        minPts: minimum number of neighbours a core point must have.

    Returns:
        dict mapping each core point index to the list of indices of the
        points inside its neighbourhood.
    """
    if dist_matrix is None:
        dist_matrix = pairwise_distance(data, data)

    cores = {}
    for idx, row in enumerate(dist_matrix):
        # Indices of all other points that fall inside the neighbourhood.
        neighbours = [
            other
            for other, dist in enumerate(row)
            if other != idx and dist < eps
        ]
        if len(neighbours) < minPts:
            continue  # too sparse to be a core point
        cores[idx] = neighbours
    return cores
# Pairwise Euclidean distance between the rows of x and the rows of y
def pairwise_distance(x, y):
    """Return the Euclidean distances between every row of x and every row of y.

    Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` so the whole
    matrix is computed with one matrix product.

    Args:
        x: array-like of shape (n, d).
        y: array-like of shape (m, d).

    Returns:
        numpy array of shape (n, m) whose entry ``[i, j]`` is the Euclidean
        distance between ``x[i]`` and ``y[j]``.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    xx = np.sum(x * x, axis=1, keepdims=True)
    yy = np.sum(y * y, axis=1, keepdims=True)
    squared = xx + yy.T - 2 * np.matmul(x, y.T)
    # Floating-point cancellation can leave tiny negative values for
    # (near-)identical points; clamp them so the square root is defined.
    np.maximum(squared, 0, out=squared)
    # The expansion above yields *squared* distances; take the root so that
    # callers compare eps against a real Euclidean distance.
    return np.sqrt(squared)
def dbscan(data, eps, minPts):
    """Cluster ``data`` with DBSCAN and return one label per sample.

    Core points are found with ``get_cores_area``; each cluster is grown
    from an arbitrary remaining core point by repeatedly absorbing the
    neighbourhoods of the core points it reaches.

    Args:
        data: samples, one per row.
        eps: neighbourhood radius passed to ``get_cores_area``.
        minPts: minimum neighbour count passed to ``get_cores_area``.

    Returns:
        float numpy array of length ``len(data)``. Entry ``i`` is the index
        of the cluster containing sample ``i``, or ``-1`` when the sample is
        noise (reached by no core point). A point reachable from several
        clusters keeps the label of the last cluster that claimed it.
    """
    dist_matrix = pairwise_distance(data, data)
    cores_area = get_cores_area(data, dist_matrix, eps, minPts)

    clusters = []
    while cores_area:
        # Seed a new cluster from any core point not yet absorbed.
        core, nears = cores_area.popitem()
        members = set()
        visit_set = {core, *nears}
        while visit_set:
            point = visit_set.pop()
            members.add(point)
            # Only core points propagate the cluster; popping them from
            # cores_area guarantees each is expanded exactly once.
            neighbours = cores_area.pop(point, None)
            if neighbours is not None:
                # Skip points already collected to avoid re-queuing them.
                visit_set.update(set(neighbours) - members)
        clusters.append(members)

    # Start from -1 so that noise points are not mistaken for members of
    # cluster 0 (which is what a zero-initialised array would imply).
    labels = np.full(len(data), -1.0)
    for cluster_id, members in enumerate(clusters):
        labels[list(members)] = cluster_id
    return labels
if __name__ == '__main__':
    # Grid search over the DBSCAN hyper-parameters: run the clustering for
    # every (eps, minPts) pair and print the scores reported by data_utils.
    data, labels = du.read_data()
    minPts_set = np.arange(3, 10, 1)
    eps_set = np.arange(0.15, 0.3, 0.01).round(2)
    for eps in eps_set:
        for minPts in minPts_set:
            print("eps=", eps, ", minPts=", minPts)
            scores = du.standard(dbscan(data, eps, minPts))
            purity, fscore, precision, recall = scores
            print("purity=", purity, ", F-score=", fscore)
data_utils.py
import pandas as pd
import numpy as np
def read_data():
    """Load ``Frogs_MFCCs.csv`` from the current working directory.

    Returns:
        Tuple ``(f_data, labels)``: ``f_data`` is a float numpy array built
        from the first 22 columns of every row, ``labels`` is column 22 of
        every row as read by pandas.
    """
    pf = pd.read_csv('Frogs_MFCCs.csv', encoding='utf-8')
    data = np.array(pf)
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; the builtin gives the same float64 result on all versions.
    f_data = data[:, 0:22].astype(float)
    return f_data, data[:, 22]
def standard(cluster):
    """Score a clustering against the labels returned by ``read_data``.

    Args:
        cluster: per-sample cluster assignment, forwarded unchanged to
            ``get_purity`` and ``f_score``.

    Returns:
        Tuple ``(purity, fscore, precision, recall)``.
    """
    _, labels = read_data()
    label_set = sorted(set(labels))
    purity = get_purity(cluster, labels, label_set)
    # f_score also returns the four confusion counts; they are not needed here.
    fscore, precision, recall, _tp, _fp, _fn, _tn = f_score(cluster, labels)
    return purity, fscore, precision, recall
if __name__ == '__main__':
    # Smoke run: load the data set and collect its distinct label values.
    f_data, labels = read_data()
    label_set = [*set(labels)]
代碼及數據下載:https://download.csdn.net/download/SAM2un/12036441