# 參考文獻:"New Balanced Active Learning Model and Optimization Algorithm",IJCAI 2018。
# 原文代碼找了很久都沒有找到(至少截至 2020-06-10 在 GitHub 上沒有),爲了深入學習只能自己動手實現。
# 代碼尚未進行類封裝。目前效果一般,可能是參數沒有設置好,有空再好好改進。
## K-Means 調用了sklearn包,沒有按照論文所述的100次聚類結果選優。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets
from collections import OrderedDict
import copy
def t_updata(t_old):
    """Return the next momentum scalar of the accelerated proximal-gradient method.

    Implements the FISTA recurrence ``t_{k+1} = (1 + sqrt(1 + 4 * t_k**2)) / 2``.

    Args:
        t_old: Current momentum scalar ``t_k``.

    Returns:
        The updated scalar ``t_{k+1}``.
    """
    # t_k must be squared: with the unsquared form the sequence converges to 2,
    # so the extrapolation weight (t_k - 1) / t_{k+1} stalls at 0.5 instead of
    # growing towards 1.
    return 0.5 + 0.5 * np.sqrt(1 + 4 * t_old ** 2)
def C_cal(W, X, gamma):
    """Take one gradient step: ``W - gamma * (X.T X W - X.T X)``.

    Args:
        W: Current coefficient matrix; must be conformable with ``X.T @ X``.
        X: Data matrix. May be an ``np.matrix`` or a plain 2-D ``ndarray``.
        gamma: Gradient step size.

    Returns:
        The matrix ``W`` after the gradient step.
    """
    # Use the explicit matrix-product operator so plain ndarrays are not
    # multiplied element-wise; for np.matrix operands ``@`` and ``*`` agree.
    gram = X.T @ X  # computed once and reused for both terms
    return W - gamma * (gram @ W - gram)
def P_zeta_1(a, b, zeta):
    """Soft-threshold ``a`` with a threshold determined by its own magnitudes.

    For k = 1, 2, ..., len(a) the candidate threshold is
    ``delta = (sum of the k largest |a_i| - b) / (1 / zeta + k)``.
    The first ``delta`` that lies between the (k+1)-th and the k-th largest
    magnitude (the (len(a)+1)-th is taken as 0) is accepted and the result is
    ``sign(a) * max(|a| - delta, 0)``.

    Args:
        a: 1-D sequence of values to shrink.
        b: Scalar offset subtracted from the running sum of magnitudes.
        zeta: Non-zero scalar weight; its reciprocal enters the denominator.

    Returns:
        A tuple ``(X, y)``: the shrunk vector and its L1 norm. If no candidate
        threshold satisfies the condition (including empty ``a``), ``(a, b)``
        is returned unchanged.
    """
    magnitudes = np.abs(np.asarray(a, dtype=float))
    # Rank by magnitude, not by signed value, so negative entries are ordered
    # correctly.
    ranked = np.sort(magnitudes)[::-1]
    # A trailing 0 provides the lower bound when every entry survives.
    padded = np.append(ranked, 0.0)
    inv_zeta = 1.0 / zeta

    running = 0.0
    # Scan every possible support size; stopping early would miss solutions
    # that keep more entries and would overrun short inputs.
    for k in range(len(ranked)):
        running += ranked[k]
        delta = (running - b) / (inv_zeta + k + 1)
        if padded[k + 1] <= delta <= padded[k]:
            shrunk = np.sign(a) * np.maximum(magnitudes - delta, 0.0)
            return shrunk, np.linalg.norm(shrunk, 1)

    return a, b
def KMeans_results(X, Cluster_Num):
    """Cluster the rows of ``X`` with K-Means and group sample indices by cluster.

    Args:
        X: Sample array passed to ``KMeans.fit`` (one sample per row).
        Cluster_Num: Number of clusters to fit.

    Returns:
        An ``OrderedDict`` with keys ``0 .. Cluster_Num - 1``; each value is the
        list of row indices of ``X`` assigned to that cluster.
    """
    # Fixed random_state keeps the clustering reproducible between runs.
    labels = KMeans(n_clusters=Cluster_Num, random_state=101).fit(X).labels_

    members = OrderedDict((cluster_id, []) for cluster_id in range(Cluster_Num))
    for sample_idx in range(len(X)):
        members[labels[sample_idx]].append(sample_idx)
    return members
def Problem(X, W_old, G, gamma, zeta=1, iter_num=10, verbose=True):
    """Run accelerated proximal-gradient iterations on the coefficient matrix.

    Each iteration takes a gradient step via ``C_cal``, then, for every group
    of row indices in ``G``, shrinks the Euclidean norms of those rows with
    ``P_zeta_1`` and rescales the rows accordingly. A momentum extrapolation
    driven by ``t_updata`` produces the point used for the next gradient step.

    Args:
        X: Data matrix forwarded to ``C_cal``.
        W_old: Initial square coefficient matrix; it is not modified.
        G: Mapping whose values are sequences of row indices (one per group).
        gamma: Gradient step size forwarded to ``C_cal``.
        zeta: Weight forwarded to ``P_zeta_1``.
        iter_num: Number of iterations to run.
        verbose: When true, print the momentum scalar after every iteration.

    Returns:
        The coefficient matrix after the last proximal step. Rows that belong
        to no group are zero.
    """
    n = len(W_old)
    W_prev = np.array(W_old, dtype=float)  # private copy of the start point
    W_current = W_prev.copy()
    W_new = np.zeros((n, n))
    t_old = 1

    for _ in range(iter_num):
        # C = W - gamma * grad F(W), evaluated at the extrapolated point.
        C = np.asarray(C_cal(W=W_current, X=X, gamma=gamma))

        W_new = np.zeros((n, n))
        for g in G.values():
            if len(g) == 0:
                continue  # nothing to shrink for an empty group
            idx = np.asarray(g, dtype=int)
            rows = C[idx]
            norms = np.linalg.norm(rows, axis=1)
            shrunk, _ = P_zeta_1(a=norms, b=0, zeta=zeta)
            # Rows with zero norm stay zero instead of becoming NaN via 0 / 0.
            scale = np.divide(shrunk, norms, out=np.zeros_like(norms),
                              where=norms > 0)
            W_new[idx, :] = scale[:, None] * rows

        t_new = t_updata(t_old)
        if verbose:
            print("t_new = ",t_new)
        # Momentum extrapolation between the two most recent iterates.
        W_current = W_new + (W_new - W_prev) * (t_old - 1) / t_new
        W_prev = W_new
        t_old = t_new

    return W_new
if __name__ == '__main__':
    # Alternative input: synthetic blobs instead of a CSV data set.
    # X, y = datasets.make_blobs(n_samples=50, n_features=2, centers=3,
    #                            cluster_std=[1, 1, 1], random_state=101)

    # Candidate data sets; only path3 is loaded below.
    path1 = r'E:\dataset\ExperimentalData\Aggregation\aggregation.csv'
    path2 = r'E:\dataset\ExperimentalData\Three blobs\ThreeBlobs.csv'
    path3 = r'E:\dataset\ExperimentalData\R15\R15.csv'
    data = np.array(pd.read_csv(path3, header=None))
    X = data[:, :-1]           # every column except the last is a feature
    XX = copy.deepcopy(X)      # untouched copy kept for plotting
    y = data[:, -1]            # last column is the class label

    # Derive the cluster count from the labels so switching data sets needs no
    # further edits.
    K = len(set(y))
    print("類簇個數=",K)
    ClustG = KMeans_results(X, Cluster_Num=K)

    # np.mat was removed in NumPy 2.0; np.asmatrix is the supported spelling.
    X = np.asmatrix(X).T       # features x samples
    N = X.shape[1]
    print('樣本個數=',N)

    # The step size needs tuning; call np.random.seed(...) first for
    # reproducible initialisation.
    gamma = 1e-17
    W_old = np.random.random((N, N)) - 0.5
    W_new = Problem(X=X, W_old=W_old, G=ClustG, gamma=gamma)

    # Rank samples by the L1 norm of their row in W, largest first.
    row_sum_abs = np.linalg.norm(W_new, ord=1, axis=1)
    Ord_rank = np.flipud(np.argsort(row_sum_abs))
    for i in range(min(20, N)):  # never index past the number of samples
        print("======",row_sum_abs[Ord_rank[i]])

    # Select as many top-ranked samples as there are clusters.
    E = XX[Ord_rank[:K], :]

    for v in ClustG.values():
        plt.scatter(XX[v, 0], XX[v, 1])
    plt.scatter(E[:, 0], E[:, 1], c='r', marker='*', s=300)
    plt.show()
# 梯度步長 gamma 需要調參(tuning),這一點比較麻煩。
# 矩陣 W 的初始化方法也會影響樣本選擇的結果。