Welcome, guest | Sign In | My Account | Store | Cart
"""Descriptive statistical analysis tool.
"""

# Module metadata.
__author__ = "Chad J. Schroeder"

# "$Id$" is an unexpanded keyword placeholder; presumably it is meant to be
# filled in by a version-control system on checkout -- TODO confirm.
__revision__ = "$Id$"
__version__ = "0.1"

# Names exported by "from <module> import *".
__all__ = [ "StatisticsException", "Statistics" ]

class StatisticsException(Exception):
   """Error raised by the Statistics class (e.g. for an empty data set)."""

class Statistics(object):
   """Class for descriptive statistical analysis.

   Behavior:
      Computes numerical statistics for a given data set.  Every statistic
      is computed once, at construction time, from a private copy of the
      data; the caller's sequence is never modified.

   Available public methods:

      None

   Available instance attributes:

          sample: sorted (ascending) copy of the data set, as a list
      population: flag passed to the initializer; true selects population
                  variance, false (the default) selects sample variance
               N: total number of elements in the data set
             sum: sum of all values in the data set (as a float)
             min: smallest value of the data set
             max: largest value of the data set
            mode: list of (value, frequency) tuples for the value(s) that
                  appear most often, in ascending order of value; an empty
                  list if no value appears more than once
            mean: arithmetic average of the data set
           range: difference between the largest and smallest value
          median: value which is in the exact middle of the data set (the
                  average of the two middle values when N is even)
        variance: measure of the spread of the data set about the mean
          stddev: standard deviation - square root of the variance

      identification: Instance ID (the value of id(self))

   Raised Exceptions:

      StatisticsException: the data set is empty (or None)

   Base Classes:

      object (builtin)

   Example Usage:

      x = [ -1, 0, 1 ]

      try:
         stats = Statistics(x)
      except StatisticsException as mesg:
         <handle exception>

      print("N: %s" % stats.N)
      print("SUM: %s" % stats.sum)
      print("MIN: %s" % stats.min)
      print("MAX: %s" % stats.max)
      print("MODE: %s" % stats.mode)
      print("MEAN: %0.2f" % stats.mean)
      print("RANGE: %s" % stats.range)
      print("MEDIAN: %0.2f" % stats.median)
      print("VARIANCE: %0.5f" % stats.variance)
      print("STDDEV: %0.5f" % stats.stddev)
      print("DATA LIST: %s" % stats.sample)

   """

   def __init__(self, sample=None, population=False):
      """Statistics class initializer method.

      Arguments:
             sample: iterable of numeric values (list, tuple, generator, ...).
                     None is treated as an empty data set.
         population: if true, compute population variance (divide by N);
                     otherwise compute sample variance (divide by N-1).

      Raises:
         StatisticsException: if the data set is empty.
      """

      # Work on a private copy: this accepts any iterable and guarantees the
      # sort below never reorders the caller's own list.
      data = [] if sample is None else list(sample)

      # Raise an exception if the data set is empty.  The value is wrapped in
      # a 1-tuple so the "%" operator always sees exactly one argument.
      if (not data):
         raise StatisticsException("Empty data set!: %s" % (data,))

      # The data set (a list).
      self.sample = data

      # Sample/Population variance determination flag.
      self.population = population

      self.N = len(data)

      self.sum = float(sum(data))

      self.min = min(data)

      self.max = max(data)

      self.range = self.max - self.min

      self.mean = self.sum/self.N

      # Inplace sort of the private copy (list is now in ascending order);
      # the median calculation relies on this ordering.
      data.sort()

      self.__getMode()
      self.__getMedian()
      self.__getVariance()
      self.__getStandardDeviation()

      # Instance identification attribute.
      self.identification = id(self)

   def __getMode(self):
      """Determine the most repeated value(s) in the data set.

      Sets self.mode to a list of (value, frequency) tuples, or to an empty
      list when every value occurs exactly once.
      """

      # Build dictionary: key - data set value; item - its frequency.
      frequency = {}
      for value in self.sample:
         frequency[value] = frequency.get(value, 0) + 1

      # Frequency of the most repeated value(s) in the data set.
      highest = max(frequency.values())

      # If no value is repeated, there is no mode for the given data set.
      if (highest == 1):
         self.mode = []
         return

      # Collect every value that reaches the highest frequency.  Iterating
      # over the sorted keys keeps the result order deterministic on every
      # Python version.
      self.mode = [(value, highest) for value in sorted(frequency)
                   if (frequency[value] == highest)]

   def __getMedian(self):
      """Determine the value which is in the exact middle of the data set.

      Relies on self.sample already being sorted in ascending order.
      """

      # Floor division keeps the index an integer on Python 2 and Python 3.
      midpt = self.N//2

      if (self.N%2):
         # Number of elements in data set is odd: take the middle element.
         self.median = float(self.sample[midpt])
      else:
         # Number of elements in data set is even: average the middle two.
         self.median = (self.sample[midpt-1] + self.sample[midpt])/2.0

   def __getVariance(self):
      """Determine the measure of the spread of the data set about the mean.
      Sample variance is determined by default; population variance can be
      determined by setting population attribute to True.

      The sample variance of a single-element data set is undefined (the
      divisor N-1 is zero); it is reported as 0.0.
      """

      # Subtract the mean from each data item, square the difference and
      # sum all the squared deviations.
      squares = sum((item - self.mean)**2.0 for item in self.sample)

      # Divide by N (population variance) or N-1 (sample variance).
      if (self.population):
         divisor = self.N
      else:
         divisor = self.N - 1

      # Only the single-element sample case can yield a zero divisor; handle
      # it explicitly instead of hiding every possible error.
      if (divisor == 0):
         self.variance = 0.0
      else:
         self.variance = squares/divisor

   def __getStandardDeviation(self):
      """Determine the measure of the dispersion of the data set based on the
      variance.
      """

      from math import sqrt     # Mathematical functions.

      # Take the square root of the variance.
      self.stddev = sqrt(self.variance)

if __name__ == "__main__":

   import os               # Miscellaneous OS interfaces.
   import sys              # System-specific parameters and functions.

   # Self-test: run the class over a handful of data sets (ints, floats, an
   # empty set, a single element, repeated values) and print every statistic.
   # The single-argument print(...) calls and "except ... as" used below are
   # valid on both Python 2.6+ and Python 3, and produce the same output.

   a = [ -1, 0, 1 ]
   b = [ -1.0, 0.0, 1.1 ]
   c = []                  # Empty: exercises the StatisticsException path.
   d = [ 12.23 ]
   e = [ 12.23, 99.543, 66.08 ]
   f = [ -1, 0, 2, -2, 1, 3, 0, -3, 2 ]
   g = [ 0, 9, 1, 8, 2, 7, 3, 6, 4, 5 ]
   h = [ -1, -1 ]

   for x in a, b, c, d, e, f, g, h:
      try:
         stats = Statistics(x)
      except StatisticsException as mesg:
         print("")
         print("Exception caught: %s" % mesg)
         print("")
         continue
      print("")
      print("N: %s" % stats.N)
      print("SUM: %s" % stats.sum)
      print("MIN: %s" % stats.min)
      print("MAX: %s" % stats.max)
      print("MODE: %s" % stats.mode)
      print("MEAN: %0.2f" % stats.mean)
      print("RANGE: %s" % stats.range)
      print("MEDIAN: %0.2f" % stats.median)
      print("VARIANCE: %0.5f" % stats.variance)
      print("STDDEV: %0.5f" % stats.stddev)
      print("DATA LIST: %s\n" % stats.sample)
      print("")

   sys.exit(0)

History

  • revision 2 (19 years ago)
  • previous revisions are not available