# Welcome, guest | Sign In | My Account | Store | Cart
from collections import namedtuple
from math import fsum

def map_reduce(data, mapper, reducer=None):
    '''Simple map/reduce for data analysis.

    Each element of *data* is passed to the *mapper* function.
    The mapper returns a ``(key, value)`` pair, or None for
    elements that should be skipped.

    Returns a dict mapping each key to the list of values the mapper
    produced for it, in the order they were encountered.  If a
    *reducer* is specified, it is called once per list and its result
    replaces the list in the returned dict.

    >>> def even_odd(elem):                     # sample mapper
    ...     if 10 <= elem <= 20:                # skip elems outside the range
    ...         key = elem % 2                  # group into evens and odds
    ...         return key, elem

    >>> map_reduce(range(30), even_odd)         # show group members
    {0: [10, 12, 14, 16, 18, 20], 1: [11, 13, 15, 17, 19]}

    >>> map_reduce(range(30), even_odd, sum)    # sum each group
    {0: 90, 1: 75}

    '''
    groups = {}
    for elem in data:
        pair = mapper(elem)
        if pair is None:
            # The mapper asked for this element to be left out.
            continue
        key, value = pair
        # Explicit membership test avoids allocating a throwaway list
        # on every call, as setdefault(key, []) would.
        if key in groups:
            groups[key].append(value)
        else:
            groups[key] = [value]

    if reducer is None:
        return groups
    # Build a fresh dict rather than rewriting values mid-iteration.
    return {key: reducer(group) for key, group in groups.items()}

# Result record returned by describe(): count, minimum, mean, maximum
# and (population) standard deviation of a group of numbers.
Summary = namedtuple('Summary', ['n', 'lo', 'mean', 'hi', 'std_dev'])


def describe(data):
    '''Simple reducer for descriptive statistics.

    *data* must be a non-empty collection of numbers that supports
    len() and can be iterated more than once (for example a list).

    Returns a Summary(n, lo, mean, hi, std_dev).  The standard
    deviation is the population form: the squared deviations are
    divided by n, not n - 1.

    Raises ValueError (from min()) when *data* is empty.
    '''
    n = len(data)
    lo = min(data)
    hi = max(data)
    # fsum keeps the floating-point sums accurate.
    mean = fsum(data) / n
    variance = fsum((x - mean) ** 2 for x in data) / n
    std_dev = variance ** 0.5
    return Summary(n, lo, mean, hi, std_dev)


if __name__ == '__main__':
    # Demonstration: group a small sample dataset several ways with
    # map_reduce(), then run the doctests embedded in this module.

    from pprint import pprint
    import doctest

    Person = namedtuple('Person', ['name', 'gender', 'age', 'height'])

    persons = [
        Person('mary', 'fem', 21, 60.2),
        Person('suzy', 'fem', 32, 70.1),
        Person('jane', 'fem', 27, 58.1),
        Person('jill', 'fem', 24, 69.1),
        Person('bess', 'fem', 43, 66.6),
        Person('john', 'mal', 25, 70.8),
        Person('jack', 'mal', 40, 59.1),
        Person('mike', 'mal', 42, 60.3),
        Person('zack', 'mal', 45, 63.7),
        Person('alma', 'fem', 34, 67.0),
        Person('bill', 'mal', 20, 62.1),
    ]

    def height_by_gender_and_agegroup(p):
        'Mapper: key is (gender, age decade), value is the height.'
        key = p.gender, p.age // 10
        val = p.height
        return key, val

    pprint(persons)                                                      # ungrouped dataset
    pprint(map_reduce(persons, lambda p: ((p.gender, p.age // 10), p)))  # grouped people
    pprint(map_reduce(persons, height_by_gender_and_agegroup, None))     # grouped heights
    pprint(map_reduce(persons, height_by_gender_and_agegroup, len))      # size of each group
    pprint(map_reduce(persons, height_by_gender_and_agegroup, describe)) # describe each group
    print(doctest.testmod())

# Diff to Previous Revision
#
# --- revision 8 2011-04-25 23:47:59
# +++ revision 9 2011-05-15 16:46:55
# @@ -28,7 +28,10 @@
#          r = mapper(elem)
#          if r is not None:
#              key, value = r
# -            d.setdefault(key, []).append(value)
# +            if key in d:
# +                d[key].append(value)
# +            else:
# +                d[key] = [value]
#      if reducer is not None:
#          for key, group in d.items():
#              d[key] = reducer(group)
#
# History