1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
| import numpy as np
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (array-like of numbers).
        vecB: Second vector, broadcast-compatible with ``vecA``.

    Returns:
        The square root of the sum of squared element-wise differences.
        The sum runs along the first axis, so 1-D inputs give a scalar.
    """
    diff = np.asarray(vecA) - np.asarray(vecB)
    # np.sum(axis=0) reduces along the same axis the built-in sum() did when
    # iterating an ndarray, but does the work inside NumPy.
    return np.sqrt(np.sum(np.power(diff, 2), axis=0))


def newCent(L):
    """Compute one initial centroid per class from the labeled data.

    Args:
        L: 2-D array-like whose last column holds the class label and whose
            remaining columns hold the features.

    Returns:
        Array with one row per distinct label and one column per feature.
        Row ``i`` is the feature mean of the samples carrying the ``i``-th
        label in ``np.unique`` (ascending) order.
    """
    L = np.asarray(L)
    features = L[:, :-1]
    labels = L[:, -1]
    centroids = [
        features[labels == label].mean(axis=0) for label in np.unique(labels)
    ]
    return np.array(centroids)
def semi_kMeans(L, U, distMeas=distEclud, initial_centriod=newCent, max_iter=100):
    """Cluster labeled and unlabeled samples with seeded k-means.

    The labeled samples are used only to seed the centroids. After that,
    every sample (labeled and unlabeled alike) is repeatedly assigned to its
    nearest centroid and each centroid is moved to the mean of its members,
    until no assignment changes or ``max_iter`` passes have run.

    Args:
        L: 2-D array of labeled samples; the last column is the class label.
        U: 2-D array of unlabeled samples with the same feature columns as
            ``L`` minus the label column.
        distMeas: Callable ``(centroid, sample) -> distance``.
        initial_centriod: Callable taking ``L`` and returning the initial
            centroids, one row per cluster.
        max_iter: Upper bound on assignment/update passes, so that a distance
            measure for which mean updates do not converge cannot loop
            forever.

    Returns:
        1-D float array with one entry per row of ``vstack(L[:, :-1], U)``
        (labeled rows first). Each entry is the index of the assigned
        cluster, i.e. the row index into the centroid array, not the
        original label value.
    """
    L = np.asarray(L)
    dataSet = np.vstack((L[:, :-1], U))
    k = len(np.unique(L[:, -1]))
    m = dataSet.shape[0]

    # Private float copy: mean updates must not be truncated by an integer
    # dtype and must not write into the initializer's return value.
    centroids = np.array(initial_centriod(L), dtype=float)

    # -1 means "not assigned yet", so the first pass always registers as a
    # change even when every sample lands in cluster 0.
    clusterAssment = np.full(m, -1.0)

    for _ in range(max_iter):
        clusterChanged = False

        # Assignment step: attach every sample to its nearest centroid.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i] != minIndex:
                clusterChanged = True
            clusterAssment[i] = minIndex

        if not clusterChanged:
            break

        # Update step: move each centroid to the mean of its members. A
        # cluster that lost all members keeps its previous centroid.
        for j in range(k):
            members = dataSet[clusterAssment == j]
            if len(members) > 0:
                centroids[j, :] = members.mean(axis=0)

    return clusterAssment
if __name__ == '__main__':
    # Demo: a small labeled set (last column is the class label) and an
    # unlabeled set with the same two feature columns.
    L = np.array([
        [1.0, 4.2, 1],
        [1.3, 4.0, 1],
        [1.0, 4.0, 1],
        [1.5, 4.3, 1],
        [2.0, 4.0, 0],
        [2.3, 3.7, 0],
        [4.0, 1.0, 0],
    ])
    U = np.array([
        [1.4, 5.0],
        [1.3, 5.4],
        [2.0, 5.0],
        [4.0, 2.0],
        [5.0, 1.0],
        [5.0, 2.0],
    ])

    # One cluster index per row: the rows of L first, then the rows of U.
    clusterResult = semi_kMeans(L, U)
    print(clusterResult)
|