1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
"""Descriptive statistical analysis tool.
"""

# Module metadata.
__author__ = "Chad J. Schroeder"
__revision__ = "$Id$"
__version__ = "0.1"

# Names exported by a star-import of this module.
__all__ = [ "StatisticsException", "Statistics" ]
class StatisticsException(Exception):
    """Statistics Exception class.

    Raised by the Statistics class when it is given an empty data set.
    """
class Statistics(object):
    """Class for descriptive statistical analysis.

    Behavior:
        Computes numerical statistics for a given data set.  Everything is
        computed once, when the instance is created, and exposed through
        instance attributes.

    Available public methods:
        None

    Available instance attributes:
        sample: sorted (ascending) copy of the data set, as a list
        population: flag selecting population (True) or sample (False)
            variance
        N: total number of elements in the data set
        sum: sum of all values in the data set (float)
        min: smallest value of the data set
        max: largest value of the data set
        mode: list of (value, count) tuples for the value(s) that appear most
            often in the data set; empty when no value is repeated
        mean: arithmetic average of the data set
        range: difference between the largest and smallest value in the data
            set
        median: value which is in the exact middle of the data set (float)
        variance: measure of the spread of the data set about the mean
        stddev: standard deviation - measure of the dispersion of the data
            set, based on variance
        identification: instance ID, as returned by id()

    Raised Exceptions:
        StatisticsException: the data set is empty

    Bases Classes:
        object (builtin)

    Example Usage:
        x = [ 1, 0, 1 ]
        try:
            stats = Statistics(x)
        except StatisticsException as mesg:
            <handle exception>
        print("N: %s" % stats.N)
        print("SUM: %s" % stats.sum)
        print("MIN: %s" % stats.min)
        print("MAX: %s" % stats.max)
        print("MODE: %s" % stats.mode)
        print("MEAN: %0.2f" % stats.mean)
        print("RANGE: %s" % stats.range)
        print("MEDIAN: %0.2f" % stats.median)
        print("VARIANCE: %0.5f" % stats.variance)
        print("STDDEV: %0.5f" % stats.stddev)
        print("DATA LIST: %s" % stats.sample)
    """

    def __init__(self, sample=None, population=False):
        """Compute every statistic for the given data set.

        Arguments:
            sample: the data set, an iterable of numbers.  It is copied and
                the copy is sorted, so the caller's object is not modified.
            population: when True, population variance is computed (divide by
                N); otherwise sample variance is computed (divide by N-1).

        Raises:
            StatisticsException: if sample is omitted or empty.
        """
        # None stands in for "no data" so that no mutable object is shared
        # between calls as a default argument.
        if sample is None:
            sample = []

        # Sorted copy of the data (ascending order).  Working on a copy keeps
        # the caller's list untouched and allows tuples and other iterables.
        data = sorted(sample) if sample else []

        # Raise an exception if the data set is empty.  The value is wrapped
        # in a 1-tuple so that it is always formatted as a single item.
        if not data:
            raise StatisticsException("Empty data set!: %s" % (sample,))

        # The data set (a list).
        self.sample = data

        # Sample/Population variance determination flag.
        self.population = population

        self.N = len(self.sample)
        self.sum = float(sum(self.sample))
        self.min = min(self.sample)
        self.max = max(self.sample)
        self.range = self.max - self.min
        self.mean = self.sum / self.N

        self.__getMode()
        self.__getMedian()
        self.__getVariance()
        self.__getStandardDeviation()

        # Instance identification attribute.
        self.identification = id(self)

    def __getMode(self):
        """Determine the most repeated value(s) in the data set.

        Sets self.mode to a list of (value, count) tuples, one for each value
        whose count equals the highest count, ordered by value.  The list is
        empty when the highest count is 1, i.e. there is no mode.
        """
        # Build dictionary: key - data set value; item - its frequency.
        frequency = {}
        for value in self.sample:
            frequency[value] = frequency.get(value, 0) + 1

        # The largest frequency is the count shared by the mode value(s).
        highest = max(frequency.values())

        # If the highest count is 1, there is no mode for the given data set.
        if highest == 1:
            self.mode = []
            return

        # Collect every value that reaches the highest count.  Sorting the
        # items makes the order of the result independent of dict ordering.
        self.mode = [(value, count)
                     for value, count in sorted(frequency.items())
                     if count == highest]

    def __getMedian(self):
        """Determine the value which is in the exact middle of the data set.

        Sets self.median to a float: the middle element when N is odd, or the
        average of the two middle elements when N is even.
        """
        # Floor division keeps the index an integer on every Python version.
        midpt = self.N // 2
        if self.N % 2:  # Number of elements in data set is odd.
            self.median = float(self.sample[midpt])
        else:           # Number of elements in data set is even.
            self.median = (self.sample[midpt - 1] + self.sample[midpt]) / 2.0

    def __getVariance(self):
        """Determine the measure of the spread of the data set about the mean.

        Sample variance is determined by default; population variance can be
        determined by setting the population attribute to True.  Sets
        self.variance; the value is 0 when the divisor would be zero (sample
        variance of a single element).
        """
        # Subtract the mean from each data item, square the difference and
        # sum all the squared deviations.
        squares = sum((item - self.mean) ** 2.0 for item in self.sample)

        # Divide by N (population variance) or N-1 (sample variance).
        if self.population:
            divisor = self.N
        else:
            divisor = self.N - 1

        # A zero divisor means the variance is undefined; report 0 instead of
        # letting the division fail.
        if divisor:
            self.variance = squares / divisor
        else:
            self.variance = 0

    def __getStandardDeviation(self):
        """Determine the measure of the dispersion of the data set based on
        the variance.

        Sets self.stddev to the square root of self.variance.
        """
        from math import sqrt  # Mathematical functions.

        # Take the square root of the variance.
        self.stddev = sqrt(self.variance)
if __name__ == "__main__":
    import os   # Miscellaneous OS interfaces.
    import sys  # System-specific parameters and functions.

    # Self-test: build a Statistics instance for each data set below and
    # print every computed attribute.  The empty data set (c) exercises the
    # StatisticsException path.
    #
    # The print calls take a single pre-formatted string, so they produce the
    # same output whether print is a statement or a function.
    #
    # NOTE(review): minus signs appear to have been stripped from this
    # listing, so some of these values may originally have been negative -
    # confirm against the original recipe.
    a = [ 1, 0, 1 ]
    b = [ 1.0, 0.0, 1.1 ]
    c = []
    d = [ 12.23 ]
    e = [ 12.23, 99.543, 66.08 ]
    f = [ 1, 0, 2, 2, 1, 3, 0, 3, 2 ]
    g = [ 0, 9, 1, 8, 2, 7, 3, 6, 4, 5 ]
    h = [ 1, 1 ]

    for x in a, b, c, d, e, f, g, h:
        try:
            stats = Statistics(x)
        except StatisticsException as mesg:
            # Report the failure between blank lines and move on to the next
            # data set.
            print("")
            print("Exception caught: %s" % mesg)
            print("")
            continue

        print("")
        print("N: %s" % stats.N)
        print("SUM: %s" % stats.sum)
        print("MIN: %s" % stats.min)
        print("MAX: %s" % stats.max)
        print("MODE: %s" % stats.mode)
        print("MEAN: %0.2f" % stats.mean)
        print("RANGE: %s" % stats.range)
        print("MEDIAN: %0.2f" % stats.median)
        print("VARIANCE: %0.5f" % stats.variance)
        print("STDDEV: %0.5f" % stats.stddev)
        print("DATA LIST: %s\n" % stats.sample)

    # NOTE(review): the original indentation was lost; the closing blank line
    # is assumed to follow the loop rather than sit inside it.
    print("")

    sys.exit(0)

Pre-existing solutions. Very useful, but ...
There is a similar module in development within the python cvs tree: python/nondist/sandbox/statistics/statistics.py
There is also a nice stats module at http://www.nmr.mgh.harvard.edu/Neural_Systems_Group/gary/python.html
SciPy (http://www.scipy.org) also has some statistics-related functions: http://www.scipy.org/documentation/apidocs/scipy/scipy.stats.html
Bug in variance computation. You use x both as the loop variable and as the summation variable, so the result is bogus. E.g. Statistics([-1, -1]) gives a negative variance (and thus an exception from sqrt).
Also, computing the mode is more complicated than necessary: why not use max() on the frequency values directly?
(Converting to a set does not save anything; one must touch all the elements of the list anyway, so it's O(n). And even if one wants to do list(set(lst)) first, max(lst) is faster than sorted(lst)[-1] or sorted(lst, reverse=True)[0].)
Mode and variance. Thanks for pointing out the summation oversight and mode computation improvement. Modifications have been merged.
Chad, thanks for the code. I had a project in VBA for ArcGIS that I'm moving to Python, and this was what I needed. However, when I call it from the main module (my terms may not be right since I'm new to Python), I get the same stats for the 2nd through nth runs as for the 1st.
My code generates a list named lstYield of points within a specific polygon
stat_analysis.Statistics(lstYield)
then the median, max, and std dev will be saved to the table for that record in the polygon's attribute table, but like I said I get one viable run, then it repeats the same N and stats. I've checked the list going in, and it is changing. I tried setting the stats.N, stats.median, etc to 0 before getting the next list of points, then all variables return as 0. Any thoughts would be greatly appreciated. Kim
Chad, figured it out, had to say blockStats = stat_analysis.Statistics(lstYield), then it ran just fine. Sorry for jumping the gun in posting question, but thanks again for the code!