# Welcome, guest | Sign In | My Account | Store | Cart
"""Descriptive statistical analysis tool.
"""


# Module metadata.
__author__ = "Chad J. Schroeder"

__revision__ = "$Id$"
__version__ = "0.1"

# Names exported by "from <module> import *".
__all__ = ["StatisticsException", "Statistics"]

class StatisticsException(Exception):
    """Statistics Exception class.

    Raised by Statistics when it is given an empty data set.
    """

class Statistics(object):
    """Class for descriptive statistical analysis.

    Behavior:
       Computes numerical statistics for a given data set.  All statistics
       are computed once, in the initializer, and stored as instance
       attributes.

    Available public methods:

       None

    Available instance attributes:

           sample: sorted (ascending) copy of the data set, as a list
       population: flag selecting population (True) or sample (False)
                   variance
                N: total number of elements in the data set
              sum: sum of all values in the data set (float)
              min: smallest value of the data set
              max: largest value of the data set
             mode: list of (value, count) tuples for the value(s) that
                   appear most often; empty list when no value repeats
             mean: arithmetic average of the data set
            range: difference between the largest and smallest value
           median: value which is in the exact middle of the data set
                   (mean of the two middle values when N is even)
         variance: measure of the spread of the data set about the mean
           stddev: standard deviation - square root of the variance

       identification: Instance ID (the value of id(self))

    Raised Exceptions:

       StatisticsException - the data set is empty

    Base Classes:

       object (builtin)

    Example Usage:

       x = [ -1, 0, 1 ]

       try:
          stats = Statistics(x)
       except StatisticsException as mesg:
          <handle exception>

       print("N: %s" % stats.N)
       print("SUM: %s" % stats.sum)
       print("MIN: %s" % stats.min)
       print("MAX: %s" % stats.max)
       print("MODE: %s" % stats.mode)
       print("MEAN: %0.2f" % stats.mean)
       print("RANGE: %s" % stats.range)
       print("MEDIAN: %0.2f" % stats.median)
       print("VARIANCE: %0.5f" % stats.variance)
       print("STDDEV: %0.5f" % stats.stddev)
       print("DATA LIST: %s" % stats.sample)

    """

    def __init__(self, sample=None, population=False):
        """Statistics class initializer method.

        sample:     sequence of numeric values to analyse.
        population: when True, compute the population variance (divide by
                    N); otherwise compute the sample variance (divide by
                    N-1).

        Raises StatisticsException if the data set is empty or omitted.
        """

        # None stands in for "no data" so that no mutable default is shared
        # between calls; the error message below still shows "[]".
        if sample is None:
            sample = []

        # Raise an exception if the data set is empty.  The value is wrapped
        # in a 1-tuple so that an empty tuple formats instead of breaking
        # the % operator.
        if not sample:
            raise StatisticsException("Empty data set!: %s" % (sample,))

        # The data set (a private list copy, so the caller's sequence is
        # never reordered by the sort below).
        self.sample = list(sample)

        # Sample/Population variance determination flag.
        self.population = population

        self.N = len(self.sample)
        self.sum = float(sum(self.sample))
        self.min = min(self.sample)
        self.max = max(self.sample)
        self.range = self.max - self.min
        self.mean = self.sum / self.N

        # Sort the copy (ascending); the median needs ordered data.
        self.sample.sort()

        self.__getMode()
        self.__getMedian()
        self.__getVariance()
        self.__getStandardDeviation()

        # Instance identification attribute.
        self.identification = id(self)

    def __getMode(self):
        """Determine the most repeated value(s) in the data set.

        Sets self.mode to a list of (value, count) tuples, one per value
        sharing the highest count, or to [] when every value occurs once.
        """

        # Build dictionary: key - data set value; item - its frequency.
        frequency = {}
        for value in self.sample:
            frequency[value] = frequency.get(value, 0) + 1

        # The largest frequency found in the data set.
        highest = max(frequency.values())

        # If no value occurs more than once, the data set has no mode.
        if highest == 1:
            self.mode = []
            return

        # Collect every value occurring 'highest' times.  Keys are visited
        # in ascending order so the result does not depend on dict ordering.
        self.mode = [(value, highest) for value in sorted(frequency)
                     if frequency[value] == highest]

    def __getMedian(self):
        """Determine the value which is in the exact middle of the data set.

        Relies on self.sample already being sorted.
        """

        # Floor division keeps the index an integer on every Python version.
        midpt = self.N // 2

        if self.N % 2:            # Number of elements in data set is odd.
            self.median = float(self.sample[midpt])
        else:                     # Number of elements in data set is even.
            self.median = (self.sample[midpt - 1] + self.sample[midpt]) / 2.0

    def __getVariance(self):
        """Determine the measure of the spread of the data set about the mean.

        Sample variance is determined by default; population variance can be
        determined by setting population attribute to True.  A one-element
        data set has no sample variance (N-1 is zero); 0.0 is stored in that
        case.
        """

        # Subtract the mean from each data item, square the difference and
        # sum all the squared deviations.
        squares = 0
        for item in self.sample:
            squares += (item - self.mean) ** 2.0

        # Divide by N (population variance) or N-1 (sample variance).
        if self.population:
            divisor = self.N
        else:
            divisor = self.N - 1

        # Only a zero divisor is handled here; any other error propagates.
        if divisor == 0:
            self.variance = 0.0
        else:
            self.variance = squares / divisor

    def __getStandardDeviation(self):
        """Determine the measure of the dispersion of the data set based on
        the variance.
        """

        from math import sqrt     # Mathematical functions.

        # Take the square root of the variance.
        self.stddev = sqrt(self.variance)

if __name__ == "__main__":

    import sys              # System-specific parameters and functions.

    # Self-test: build a Statistics instance for each data set and print
    # every computed attribute.  The empty data set exercises the
    # StatisticsException path.
    datasets = (
        [-1, 0, 1],
        [-1.0, 0.0, 1.1],
        [],
        [12.23],
        [12.23, 99.543, 66.08],
        [-1, 0, 2, -2, 1, 3, 0, -3, 2],
        [0, 9, 1, 8, 2, 7, 3, 6, 4, 5],
        [-1, -1],
    )

    # print("...") with a single string argument behaves the same on
    # Python 2 and Python 3; print("") emits the blank separator lines.
    for x in datasets:
        try:
            stats = Statistics(x)
        except StatisticsException as mesg:
            print("")
            print("Exception caught: %s" % mesg)
            print("")
            continue

        print("")
        print("N: %s" % stats.N)
        print("SUM: %s" % stats.sum)
        print("MIN: %s" % stats.min)
        print("MAX: %s" % stats.max)
        print("MODE: %s" % stats.mode)
        print("MEAN: %0.2f" % stats.mean)
        print("RANGE: %s" % stats.range)
        print("MEDIAN: %0.2f" % stats.median)
        print("VARIANCE: %0.5f" % stats.variance)
        print("STDDEV: %0.5f" % stats.stddev)
        print("DATA LIST: %s\n" % stats.sample)
        print("")

    sys.exit(0)

# History
#
#   * revision 2 (19 years ago)
#   * previous revisions are not available