Welcome, guest | Sign In | My Account | Store | Cart

Three ways to calculate a distance matrix out of a list of n-dimensional points using scipy.

Python, 73 lines
 ``` 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74``` ```from scipy import reshape, sqrt, identity # nDimPoints: list of n-dim tuples # distFunc: calculates the distance based on the differences # Ex: Manhatten would be: distFunc=sum(deltaPoint[d] for d in xrange(len(deltaPoint) def calcDistanceMatrix(nDimPoints, distFunc=lambda deltaPoint: sqrt(sum(deltaPoint[d]**2 for d in xrange(len(deltaPoint))))): nDimPoints = array(nDimPoints) dim = len(nDimPoints[0]) delta = [None]*dim for d in xrange(dim): data = nDimPoints[:,d] delta[d] = data - reshape(data,(len(data),1)) # computes all possible combinations dist = distFunc(delta) dist = dist + identity(len(data))*dist.max() # eliminate self matching # dist is the matrix of distances from one coordinate to any other return dist from numpy.matlib import repmat, repeat def calcDistanceMatrixFastEuclidean(points): numPoints = len(points) distMat = sqrt(sum((repmat(points, numPoints, 1) - repeat(points, numPoints, axis=0))**2, axis=1)) return distMat.reshape((numPoints,numPoints)) from numpy import mat, zeros, newaxis def calcDistanceMatrixFastEuclidean2(nDimPoints): nDimPoints = array(nDimPoints) n,m = nDimPoints.shape delta = zeros((n,n),'d') for d in xrange(m): data = nDimPoints[:,d] delta += (data - data[:,newaxis])**2 return sqrt(delta) ################# # Unittest ################# class CalcDistanceMatrixTestCase(unittest.TestCase): def setUp(self): self.distanceMatrixFunc = "calcDistanceMatrix" def test_2D(self): points = [[0, 0], [1, 1], [4, 5]] dm = eval("%s(points)"%self.distanceMatrixFunc) self.assertAlmostEqual(1.414213562373095049, dm[0][1]) self.assertAlmostEqual(6.403124237432848686, dm[0][2]) self.assertAlmostEqual(5, dm[1][2]) self._testSymmetry(dm) def test_3D(self): points = [[0, 0, 0], [1.0, 1, 1], [4, 5, 6], [10,10,10]] dm = eval("%s(points)"%self.distanceMatrixFunc) self.assertAlmostEqual(1.732050807568877294, dm[0][1]) self.assertAlmostEqual(8.77496438739212206, dm[0][2]) self.assertAlmostEqual(17.32050807568877294, dm[0][3]) self.assertAlmostEqual(7.071067811865475244, dm[1][2]) self.assertAlmostEqual(15.58845726811989564, dm[1][3]) self.assertAlmostEqual(8.77496438739212206, dm[2][3]) self._testSymmetry(dm) def _testSymmetry(self, dm): for i in range(len(dm)): for j in range(len(dm)): self.assertEqual(dm[i][j], dm[j][i]) class CalcDistanceMatrixFastTestCase(CalcDistanceMatrixTestCase): def setUp(self): self.distanceMatrixFunc = "calcDistanceMatrixFastEuclidean" class CalcDistanceMatrixFast2TestCase(CalcDistanceMatrixTestCase): def setUp(self): self.distanceMatrixFunc = "calcDistanceMatrixFastEuclidean2" ```

The first algorithm is taken from the very good tutorial http://stsdas.stsci.edu/perry/pydatatut.pdf (chapter 3.7.7 "Finding nearest neighbors") and adapted to the n-dimensional case. Thanks to Keir Mierle for the ...FastEuclidean... functions, which are faster than calcDistanceMatrix by using euclidean distance directly.

 Created by Willi Richert on Mon, 6 Nov 2006 (PSF)