import random, copy, math import numpy as np def k_means(t, nbclusters=2, nbiter=3, medoids=False, soft=False, beta=200.0,\ distance=lambda x,y: math.sqrt(np.dot(x-y,(x-y).conj())),\ responsability=lambda beta,d: math.exp(-1 * beta * d)): """ Each row of t is an observation, each column is a feature 'nbclusters' is the number of seeds and so of clusters/centroids 'nbiter' is the number of iterations 'medoids' tells if we use the medoids or centroids method 'distance' is the function to use for comparing observations Overview of the algorithm ("hard k-means"): -> Place nbclusters points into the features space of the objects/t[i:] -> Assign each object to the group that has the closest centroid (distance) -> Recalculate the positions of the nbclusters centroids -> Repeat Steps 2 and 3 until the centroids no longer move We can change the distance function and change the responsability function -> distance will change the shape of the clusters -> responsability will change the breadth of the clusters (& associativity) """ nbobs = t.shape[0] nbfeatures = t.shape[1] # find xranges for each features min_max = [] for f in xrange(nbfeatures): min_max.append((t[:,f].min(), t[:,f].max())) ### Soft => Normalization, otherwise "beta" has no meaning! if soft: for f in xrange(nbfeatures): t[:,f] -= min_max[f][0] t[:,f] /= (min_max[f][1]-min_max[f][0]) min_max = [] for f in xrange(nbfeatures): min_max.append((t[:,f].min(), t[:,f].max())) ### /Normalization # ugly result = {} quality = 0.0 # sum of the means of the distances to centroids random.seed() tmpdist = np.ndarray([nbobs,nbclusters], np.float64) # distance obs<->clust tmpresp = np.ndarray([nbobs,nbclusters], np.float64) # responsability o<->c # iterate for the best quality for iteration in xrange(nbiter): clusters = [[] for c in xrange(nbclusters)] # Step 1: place nbclusters seeds for each features centroids = [np.array([random.uniform(min_max[f][0], min_max[f][1])\ for f in xrange(nbfeatures)], np.float64)\ for c in xrange(nbclusters)] old_centroids = [np.array([-1 for f in xrange(nbfeatures)], np.float64)\ for c in xrange(nbclusters)] # should not be init, TODO new_sum = math.fsum([distance(centroids[c], old_centroids[c])\ for c in xrange(nbclusters)]) old_sum = sys.maxint np.seterr(invalid='raise') # iterate until convergence while new_sum < old_sum : old_centroids = copy.deepcopy(centroids) old_sum = new_sum for c in xrange(nbclusters): clusters[c] = [] # precompute distance to all centroids/medoids for all observations for c in xrange(nbclusters): for o in xrange(nbobs): tmpdist[o,c] = distance(centroids[c], t[o,:]) if soft: # Step 2: compute the degree of assignment for each object for o in xrange(nbobs): for c in xrange(nbclusters): tmpresp[o,c] = responsability(beta, tmpdist[o,c]) for o in xrange(nbobs): tmpresp[o,:] /= math.fsum(tmpresp[o,:]) else: # Step 2: assign each object to the closest centroid for o in xrange(nbobs): clusters[tmpdist[o,:].argmin()].append(o) # Step 3: recalculate the positions of the nbclusters centroids for c in xrange(nbclusters): if medoids: if soft: print "ERROR: Soft medoids not implemented" sys.exit(-1) else: tmpmin = sys.maxint argmin = 0 for o in clusters[c]: if tmpdist[o,c] < tmpmin: tmpmin = tmpdist[o,c] argmin = o centroids[c] = t[argmin,:] else: mean = np.array([0 for i in xrange(nbfeatures)], np.float64) if soft: for o in xrange(nbobs): mean += tmpresp[o,c] * t[o,:] mean /= math.fsum(tmpresp[:,c]) else: for o in clusters[c]: mean += t[o,:] mean = map(lambda x: x/len(clusters[c]), mean) centroids[c] = np.array(mean, np.float64) print centroids new_sum = math.fsum([distance(centroids[c], old_centroids[c])\ for c in xrange(nbclusters)]) print "(k-means) old and new sum: ", old_sum, new_sum if soft: for o in xrange(nbobs): clusters[tmpdist[o,:].argmin()].append(o) quality = math.fsum([math.fsum([tmpdist[o][c] for o in clusters[c]])\ /(len(clusters[c])+1) for c in xrange(nbclusters)]) if not quality in result or quality > result['quality']: result['quality'] = quality result['centroids'] = centroids result['clusters'] = clusters return result