-
-
Notifications
You must be signed in to change notification settings - Fork 46k
/
k_means_clustering_tensorflow.py
146 lines (123 loc) · 5.86 KB
/
k_means_clustering_tensorflow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from random import shuffle
import tensorflow as tf
from numpy import array
def tf_k_means_cluster(vectors, noofclusters):
"""
K-Means Clustering using TensorFlow.
'vectors' should be a n*k 2-D NumPy array, where n is the number
of vectors of dimensionality k.
'noofclusters' should be an integer.
"""
noofclusters = int(noofclusters)
assert noofclusters < len(vectors)
# Find out the dimensionality
dim = len(vectors[0])
# Will help select random centroids from among the available vectors
vector_indices = list(range(len(vectors)))
shuffle(vector_indices)
# GRAPH OF COMPUTATION
# We initialize a new graph and set it as the default during each run
# of this algorithm. This ensures that as this function is called
# multiple times, the default graph doesn't keep getting crowded with
# unused ops and Variables from previous function calls.
graph = tf.Graph()
with graph.as_default():
# SESSION OF COMPUTATION
sess = tf.Session()
##CONSTRUCTING THE ELEMENTS OF COMPUTATION
##First lets ensure we have a Variable vector for each centroid,
##initialized to one of the vectors from the available data points
centroids = [
tf.Variable(vectors[vector_indices[i]]) for i in range(noofclusters)
]
##These nodes will assign the centroid Variables the appropriate
##values
centroid_value = tf.placeholder("float64", [dim])
cent_assigns = []
for centroid in centroids:
cent_assigns.append(tf.assign(centroid, centroid_value))
##Variables for cluster assignments of individual vectors(initialized
##to 0 at first)
assignments = [tf.Variable(0) for i in range(len(vectors))]
##These nodes will assign an assignment Variable the appropriate
##value
assignment_value = tf.placeholder("int32")
cluster_assigns = []
for assignment in assignments:
cluster_assigns.append(tf.assign(assignment, assignment_value))
##Now lets construct the node that will compute the mean
# The placeholder for the input
mean_input = tf.placeholder("float", [None, dim])
# The Node/op takes the input and computes a mean along the 0th
# dimension, i.e. the list of input vectors
mean_op = tf.reduce_mean(mean_input, 0)
##Node for computing Euclidean distances
# Placeholders for input
v1 = tf.placeholder("float", [dim])
v2 = tf.placeholder("float", [dim])
euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub(v1, v2), 2)))
##This node will figure out which cluster to assign a vector to,
##based on Euclidean distances of the vector from the centroids.
# Placeholder for input
centroid_distances = tf.placeholder("float", [noofclusters])
cluster_assignment = tf.argmin(centroid_distances, 0)
##INITIALIZING STATE VARIABLES
##This will help initialization of all Variables defined with respect
##to the graph. The Variable-initializer should be defined after
##all the Variables have been constructed, so that each of them
##will be included in the initialization.
init_op = tf.initialize_all_variables()
# Initialize all variables
sess.run(init_op)
##CLUSTERING ITERATIONS
# Now perform the Expectation-Maximization steps of K-Means clustering
# iterations. To keep things simple, we will only do a set number of
# iterations, instead of using a Stopping Criterion.
noofiterations = 100
for _ in range(noofiterations):
##EXPECTATION STEP
##Based on the centroid locations till last iteration, compute
##the _expected_ centroid assignments.
# Iterate over each vector
for vector_n in range(len(vectors)):
vect = vectors[vector_n]
# Compute Euclidean distance between this vector and each
# centroid. Remember that this list cannot be named
#'centroid_distances', since that is the input to the
# cluster assignment node.
distances = [
sess.run(euclid_dist, feed_dict={v1: vect, v2: sess.run(centroid)})
for centroid in centroids
]
# Now use the cluster assignment node, with the distances
# as the input
assignment = sess.run(
cluster_assignment, feed_dict={centroid_distances: distances}
)
# Now assign the value to the appropriate state variable
sess.run(
cluster_assigns[vector_n], feed_dict={assignment_value: assignment}
)
##MAXIMIZATION STEP
# Based on the expected state computed from the Expectation Step,
# compute the locations of the centroids so as to maximize the
# overall objective of minimizing within-cluster Sum-of-Squares
for cluster_n in range(noofclusters):
# Collect all the vectors assigned to this cluster
assigned_vects = [
vectors[i]
for i in range(len(vectors))
if sess.run(assignments[i]) == cluster_n
]
# Compute new centroid location
new_location = sess.run(
mean_op, feed_dict={mean_input: array(assigned_vects)}
)
# Assign value to appropriate variable
sess.run(
cent_assigns[cluster_n], feed_dict={centroid_value: new_location}
)
# Return centroids and assignments
centroids = sess.run(centroids)
assignments = sess.run(assignments)
return centroids, assignments