非监督学习之聚类算法(5)-- 基于网格

“基于网格”指将对象空间量化为有限数量的单元,形成一个网格结构,所有的聚类都在这个网格结构上进行。基本思想是将每个属性的可能值分割成许多相邻的区间,创建网格单元的集合。每个对象落入一个网格单元,网格单元对应的属性区间包含该对象的值。
基于网格的聚类算法的优点是它的处理速度很快,其处理时间独立于数据对象的数目,它只与量化空间中的每一维的单元数目有关。

CLIQUE算法

CLIQUE算法是基于网格的空间聚类算法,但它同时也非常好地结合了基于密度的聚类算法,因此既能够发现任意形状的簇,又可以像基于网格的算法一样处理较大的多维数据。
CLIQUE算法背后的直觉是:存在于k维空间中的聚类也可以在k-1维空间中找到。我们从1D开始,在每个维度上都试图找到密集网格。如果2个或更多密集网格相邻,我们将它们合并成同一个簇。
CLIQUE算法需要两个参数:一个是网格矩阵的边长,第二个是密度的阈值。网格步长确定了空间的划分,而密度阈值用来定义密集网格。

（完整代码实现如下。）
# coding: utf-8
import os
import sys
import numpy as np
import scipy.sparse.csgraph

#========================================================
# 辅助工具
#========================================================

def read_data(delimiter, feature_columns, path):
    """
    Load the selected feature columns of a delimited text file.

    Parameters
    ----------
    delimiter : str
        Field separator used in the file.
    feature_columns : sequence of int
        Indices of the columns to read.
    path : str
        Path of the data file.

    Returns
    -------
    numpy.ndarray
        Float matrix with one row per record and one column per feature.
    """
    return np.genfromtxt(
        path,
        dtype=float,
        delimiter=delimiter,
        usecols=feature_columns,
    )

def normalize_features(data):
    """
    Scale every feature column into the open interval (0, 1).

    Each column is shifted so its minimum maps to 1e-5, then divided by
    its shifted maximum plus 1e-5 — the same epsilon scheme as before,
    which keeps every value strictly above 0 and strictly below 1.

    Parameters
    ----------
    data : array-like, shape (n_points, n_features)
        Raw feature matrix.

    Returns
    -------
    numpy.ndarray
        Normalized copy of the input.  (The original implementation
        mutated the caller's array in place despite binding it to a new
        name; working on an explicit copy fixes that.)
    """
    normalized = np.array(data, dtype=float, copy=True)
    # Shift each column so its minimum lands at 1e-5 (never exactly 0).
    normalized -= normalized.min(axis=0) - 1e-5
    # Divide by the shifted column maximum plus epsilon -> values < 1.
    normalized /= normalized.max(axis=0) + 1e-5
    return normalized

def save_to_file(clusters, file_name):
    """
    Export the clustering result to a text file next to this script.

    Side effect: assigns a sequential ``id`` (0, 1, ...) to every
    cluster object before writing it.

    Parameters
    ----------
    clusters : iterable
        Cluster objects; each must allow ``c.id = i`` and define ``__str__``.
    file_name : str
        Output file name, resolved relative to this script's directory.
    """
    path = os.path.join(os.path.abspath(os.path.dirname(__file__)), file_name)
    # "with" guarantees the handle is closed even if str(c) raises
    # (the original left the file open on any exception).
    with open(path, encoding='utf-8', mode="w+") as out:
        for i, c in enumerate(clusters):
            c.id = i
            out.write("Cluster " + str(i) + ":\n" + str(c))

#========================================================
# 核心算法
#========================================================

class clique:
    """
    CLIQUE grid/density subspace clustering.

    The data space is cut into ``side`` intervals per dimension; a grid
    cell is "dense" when it holds more than ``threshold`` * N points.
    Dense cells are grown one dimension at a time (apriori-style join +
    prune) and, at full dimensionality, adjacent dense cells are merged
    into clusters via connected components.
    """

    def __init__(self, data, side, threshold):
        """
        Parameters
        ----------
        data : numpy.ndarray, shape (n_points, n_features)
            Feature matrix; values are assumed normalized into [0, 1)
            — TODO confirm against normalize_features in the caller.
        side : int
            Number of grid intervals per dimension.
        threshold : float
            Density threshold as a fraction of the total point count.
        """
        self.data = data
        self.side = side
        self.threshold = threshold

    class ClusterStructure:
        """Holder for one cluster: its dense units and its member point ids."""

        def __init__(self, dense_units, data_point_ids):
            self.id = None                        # assigned when exported
            self.dense_units = dense_units        # dense grid cells of the cluster
            self.data_point_ids = data_point_ids  # set of row indices into data

        def __str__(self):
            return ("Dense units: " + str(self.dense_units.tolist())
                    + "\nCluster size: " + str(len(self.data_point_ids))
                    + "\nData points:\n" + str(self.data_point_ids) + "\n")

    def get_one_dim_dense_units(self):
        """
        Find all dense one-dimensional units.

        Returns
        -------
        list
            Each entry is ``[[feature_index, interval_index]]``.
        """
        number_of_data_points = np.shape(self.data)[0]
        number_of_features = np.shape(self.data)[1]
        # projection[u, f] counts the points of feature f falling in interval u.
        projection = np.zeros((self.side, number_of_features))
        for f in range(number_of_features):
            # Bug fix: the original read the module-level global ``data``.
            for element in self.data[:, f]:
                projection[int(element * self.side % self.side), f] += 1
        is_dense = projection > self.threshold * number_of_data_points
        one_dim_dense_units = []
        for f in range(number_of_features):
            for unit in range(self.side):
                if is_dense[unit, f]:
                    one_dim_dense_units.append([[f, unit]])
        return one_dim_dense_units

    def get_dense_units_for_dim(self, prev_dim_dense_units, dim):
        """
        Grow (dim-1)-dimensional dense units into ``dim``-dimensional ones:
        self-join pairs of previous units, prune candidates whose parts are
        not themselves dense, then keep the candidates that are dense.
        """
        def insert_if_join_condition(candidates, item, item2, current_dim):
            # Concatenate two units; accept the join only when the union
            # spans exactly current_dim distinct features.
            joined = list(item) + list(item2)
            dims = {int(part[0]) for part in joined}
            if len(dims) == current_dim:
                candidates.append(joined)

        def self_join(prev_units, current_dim):
            candidates = []
            for i in range(len(prev_units)):
                for j in range(i + 1, len(prev_units)):
                    insert_if_join_condition(candidates, prev_units[i], prev_units[j], current_dim)
            return candidates

        def prune(candidates, prev_units):
            # Apriori pruning.  The original removed items from the list it
            # was iterating by index (skips candidates); build a filtered
            # list instead.  NOTE(review): like the original, this checks
            # single [feature, interval] parts against the previous level,
            # which is only exact for dim == 2 — confirm for deeper subspaces.
            return [cand for cand in candidates
                    if all([part] in prev_units for part in cand)]

        def is_data_in_projection(point, candidate, r):
            # True when the data row falls inside every interval of the candidate.
            for feature_index, range_index in candidate:
                if int(point[feature_index] * r % r) != range_index:
                    return False
            return True

        candidates = prune(self_join(prev_dim_dense_units, dim), prev_dim_dense_units)

        projection = np.zeros(len(candidates))
        # Bug fix: the original measured the module-level global ``data``.
        number_of_data_points = np.shape(self.data)[0]
        for data_index in range(number_of_data_points):
            for i in range(len(candidates)):
                if is_data_in_projection(self.data[data_index], candidates[i], self.side):
                    projection[i] += 1

        is_dense = projection > self.threshold * number_of_data_points
        return np.array(candidates)[is_dense]

    def get_clusters(self, dense_units):
        """
        Merge adjacent dense units into clusters.

        Two units are adjacent when they live in the same subspace and
        their grid coordinates differ by at most one step in total;
        connected components of that adjacency graph are the clusters.
        """
        def build_graph_from_dense_units(units):
            # Adjacency matrix over the dense units.
            def get_edge(node1, node2):
                dim = len(node1)
                distance = 0
                for i in range(dim):
                    if node1[i][0] != node2[i][0]:
                        return 0  # different subspaces -> never adjacent
                    distance += abs(node1[i][1] - node2[i][1])
                    if distance > 1:
                        return 0
                return 1

            graph = np.identity(len(units))
            for i in range(len(units)):
                for j in range(len(units)):
                    graph[i, j] = get_edge(units[i], units[j])
            return graph

        def get_cluster_data_point_ids(data, cluster_dense_units, xsi):
            # Union over the cluster's units of the rows that fall inside
            # every interval of that unit.
            point_ids = set()
            for i in range(np.shape(cluster_dense_units)[0]):
                tmp_ids = set(range(np.shape(data)[0]))
                for j in range(np.shape(cluster_dense_units)[1]):
                    feature_index = cluster_dense_units[i][j][0]
                    range_index = cluster_dense_units[i][j][1]
                    tmp_ids &= set(np.where(np.floor(data[:, feature_index] * xsi % xsi) == range_index)[0])
                point_ids |= tmp_ids
            return point_ids

        # Guard: an empty unit set yields an empty graph, which
        # connected_components cannot process.
        if len(dense_units) == 0:
            return []

        graph = build_graph_from_dense_units(dense_units)
        number_of_components, component_list = scipy.sparse.csgraph.connected_components(graph, directed=False)

        dense_units = np.array(dense_units)
        clusters = []
        for i in range(number_of_components):
            cluster_dense_units = dense_units[np.where(component_list == i)]
            cluster_data_point_ids = get_cluster_data_point_ids(self.data, cluster_dense_units, self.side)
            clusters.append(clique.ClusterStructure(cluster_dense_units, cluster_data_point_ids))

        return clusters

    def main(self):
        """
        Run the full CLIQUE pipeline.

        Returns
        -------
        list of clique.ClusterStructure
            The clusters found at full dimensionality (empty when no
            dense units survive the growth phase).
        """
        # Start from the dense one-dimensional units.
        dense_units = self.get_one_dim_dense_units()

        # Grow upwards one dimension at a time.
        clusters = []
        current_dim = 2
        # Bug fix: the original read the module-level global ``data``.
        number_of_features = np.shape(self.data)[1]
        while current_dim <= number_of_features and len(dense_units) > 0:
            dense_units = self.get_dense_units_for_dim(dense_units, current_dim)
            if current_dim == number_of_features:
                clusters.extend(self.get_clusters(dense_units))
            current_dim += 1

        return clusters

#========================================================
# 主程序
#========================================================

if __name__ == "__main__":

# 参数配置
file_name = "mouse.csv"
feature_columns = [0, 1] # 特征列
side = 3 # 空间棋盘尺寸
threshold = 0.1 # 密度阈值
delimiter = ' ' # 分割符
output_file = "clusters.txt" # 输出文件

print("Running CLIQUE algorithm on " + file_name + " dataset, feature columns = " + str(feature_columns))

# 读取同目录下的数据文件
path = os.path.join(os.path.abspath(os.path.dirname(__file__)), file_name)
original_data = read_data(delimiter, feature_columns, path)

# 数据归一化
data = normalize_features(original_data)

# 运行clique聚类
c = clique(data=data,
side=side,
threshold=threshold)
clusters = c.main()

# 保存聚类结果
save_to_file(clusters, output_file)
print("\n Clusters exported to " + output_file)