import random, copy, math, sys
import numpy as np
def k_means(t, nbclusters=2, nbiter=3, medoids=False, soft=False, beta=200.0,\
distance=lambda x,y: math.sqrt(np.dot(x-y,(x-y).conj())),\
responsability=lambda beta,d: math.exp(-1 * beta * d)):
"""
Each row of t is an observation, each column is a feature
'nbclusters' is the number of seeds and so of clusters/centroids
'nbiter' is the number of iterations
'medoids' tells if we use the medoids or centroids method
'distance' is the function to use for comparing observations
Overview of the algorithm ("hard k-means"):
-> Place nbclusters points into the features space of the objects/t[i:]
-> Assign each object to the group that has the closest centroid (distance)
-> Recalculate the positions of the nbclusters centroids
-> Repeat Steps 2 and 3 until the centroids no longer move
We can change the distance function and change the responsability function
-> distance will change the shape of the clusters
-> responsability will change the breadth of the clusters (& associativity)
"""
nbobs = t.shape[0]
nbfeatures = t.shape[1]
    # find the (min, max) range of each feature
min_max = []
for f in xrange(nbfeatures):
min_max.append((t[:,f].min(), t[:,f].max()))
### Soft => Normalization, otherwise "beta" has no meaning!
if soft:
for f in xrange(nbfeatures):
t[:,f] -= min_max[f][0]
t[:,f] /= (min_max[f][1]-min_max[f][0])
min_max = []
for f in xrange(nbfeatures):
min_max.append((t[:,f].min(), t[:,f].max()))
    ### /Normalization # note: t is modified in place
result = {}
quality = 0.0 # sum of the means of the distances to centroids
random.seed()
tmpdist = np.ndarray([nbobs,nbclusters], np.float64) # distance obs<->clust
tmpresp = np.ndarray([nbobs,nbclusters], np.float64) # responsability o<->c
    # try nbiter random restarts and keep the best run
for iteration in xrange(nbiter):
clusters = [[] for c in xrange(nbclusters)]
        # Step 1: place nbclusters random seeds in the feature space
centroids = [np.array([random.uniform(min_max[f][0], min_max[f][1])\
for f in xrange(nbfeatures)], np.float64)\
for c in xrange(nbclusters)]
        old_centroids = [np.array([-1 for f in xrange(nbfeatures)], np.float64)\
                for c in xrange(nbclusters)] # dummy values, only to enter the loop
new_sum = math.fsum([distance(centroids[c], old_centroids[c])\
for c in xrange(nbclusters)])
old_sum = sys.maxint
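        # raise on invalid float ops (NaN) instead of propagating them silently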
np.seterr(invalid='raise')
# iterate until convergence
        while new_sum < old_sum: # until the centroids no longer move
old_centroids = copy.deepcopy(centroids)
old_sum = new_sum
for c in xrange(nbclusters):
clusters[c] = []
# precompute distance to all centroids/medoids for all observations
for c in xrange(nbclusters):
for o in xrange(nbobs):
tmpdist[o,c] = distance(centroids[c], t[o,:])
if soft:
# Step 2: compute the degree of assignment for each object
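                # responsability(o,c) = exp(-beta * d(o,c)), normalized below
                # over clusters, i.e. a softmax per observation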
for o in xrange(nbobs):
for c in xrange(nbclusters):
tmpresp[o,c] = responsability(beta, tmpdist[o,c])
for o in xrange(nbobs):
tmpresp[o,:] /= math.fsum(tmpresp[o,:])
else:
# Step 2: assign each object to the closest centroid
for o in xrange(nbobs):
clusters[tmpdist[o,:].argmin()].append(o)
# Step 3: recalculate the positions of the nbclusters centroids
for c in xrange(nbclusters):
if medoids:
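                    # medoid update: the new center must be an actual observation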
if soft:
print "ERROR: Soft medoids not implemented"
sys.exit(-1)
else:
tmpmin = sys.maxint
argmin = 0
for o in clusters[c]:
if tmpdist[o,c] < tmpmin:
tmpmin = tmpdist[o,c]
argmin = o
centroids[c] = t[argmin,:]
else:
                    mean = np.zeros(nbfeatures, np.float64)
if soft:
for o in xrange(nbobs):
mean += tmpresp[o,c] * t[o,:]
mean /= math.fsum(tmpresp[:,c])
else:
                        for o in clusters[c]:
                            mean += t[o,:]
                        if clusters[c]: # an empty cluster would divide by zero
                            mean /= len(clusters[c])
                    centroids[c] = np.array(mean, np.float64)
print centroids
new_sum = math.fsum([distance(centroids[c], old_centroids[c])\
for c in xrange(nbclusters)])
print "(k-means) old and new sum: ", old_sum, new_sum
if soft:
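            # derive hard clusters from the final distances, for reporting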
for o in xrange(nbobs):
clusters[tmpdist[o,:].argmin()].append(o)
        # sum over clusters of the mean distance to the centroid
        # (the +1 in the denominator guards against empty clusters)
        quality = math.fsum([math.fsum([tmpdist[o,c] for o in clusters[c]])\
                /(len(clusters[c])+1) for c in xrange(nbclusters)])
        if 'quality' not in result or quality < result['quality']: # lower is better
result['quality'] = quality
result['centroids'] = centroids
result['clusters'] = clusters
return result
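
# --- Usage sketch (not part of the original recipe) ---
# A minimal, hypothetical demo on made-up data: two Gaussian blobs in 2D,
# clustered with the default Euclidean distance, then with an L1 distance,
# then in soft mode. Parameter values are illustrative, not tuned.
if __name__ == "__main__":
    np.random.seed(0)
    data = np.vstack([np.random.randn(50, 2),
                      np.random.randn(50, 2) + 5.0])

    # hard k-means with the default (Euclidean) distance
    res = k_means(data.copy(), nbclusters=2, nbiter=5)
    print "centroids:", res['centroids']
    print "cluster sizes:", [len(c) for c in res['clusters']]

    # a different distance changes the shape of the clusters (cf. docstring):
    # here Manhattan (L1) instead of Euclidean
    res_l1 = k_means(data.copy(), nbclusters=2, nbiter=5,
                     distance=lambda x, y: np.abs(x - y).sum())
    print "L1 centroids:", res_l1['centroids']

    # soft (fuzzy) assignment; note that k_means normalizes the features in
    # place in soft mode, hence the copies; beta sets assignment stiffness
    res_soft = k_means(data.copy(), nbclusters=2, nbiter=5,
                       soft=True, beta=20.0)
    print "soft centroids:", res_soft['centroids']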