Welcome, guest | Sign In | My Account | Store | Cart
#How to do aggregates using groupby, defaultdict and Counter on flat files
#This Example computes groups for:
#region, count(region), sum(sales), avg(sales), max(sales), min(sales)
#
#By Nestor Nissen

from csv import DictReader
from operator import itemgetter
from itertools import groupby
from collections import defaultdict, Counter

rawdata = '''
Region,City,Sales
Scotland,Edinburgh,20000
Scotland,Glasgow,12500
Wales,Cardiff,29700
Wales,Bangor,12800
England,London,90000
England,Manchester,45600
England,Liverpool,29700
'''.splitlines()[1:]

indata = list(DictReader(rawdata))

print('Using sort and groupby:')
counts = []
sums = []
avgs = []
maxs = []
mins = []
ordered_data = sorted(indata, key=itemgetter('Region'))
for region, group in groupby(ordered_data, key=itemgetter('Region')):
    group_list = list(group)
    count = sum(1 for city in group_list)
    total = sum(int(city['Sales']) for city in group_list)
    maxsale = max(int(city['Sales']) for city in group_list)
    minsale = min(int(city['Sales']) for city in group_list)
    counts.append((region, count))
    sums.append((region, total))
    avgs.append((region, total/count))
    maxs.append((region, maxsale))
    mins.append((region, minsale))
print('count:',counts, '\nsum:',sums, '\navg:',avgs,
      '\nmax:',maxs, '\nmin:',mins, '\n')

print('Using defaultdict:')
dd_counts = defaultdict(int)
dd_sales = defaultdict(int)
dd_maxs = defaultdict(int)
dd_mins = defaultdict(lambda: 9**99)
for row in indata:
    region = row['Region']
    sales = int(row['Sales'])
    dd_counts[region] += 1
    dd_sales[region] += sales
    dd_maxs[region] = max(dd_maxs[region], sales)
    dd_mins[region] = min(dd_mins[region], sales)
counts = list(dd_counts.items())
sums = list(dd_sales.items())
avgs = [(key, dd_sales[key]/count) for key, count in dd_counts.items()]
maxs = list(dd_maxs.items())
mins = list(dd_mins.items())
print('count:',counts, '\nsum:',sums, '\navg:',avgs,
      '\nmax:',maxs, '\nmin:',mins, '\n')

print('Using counter:')
counts = list(Counter(map(itemgetter('Region'), indata)).items())
print('count:',counts)

Diff to Previous Revision

--- revision 1 2011-01-07 16:27:43
+++ revision 2 2011-02-07 15:38:09
@@ -1,6 +1,6 @@
 #How to do aggregates using groupby, defaultdict and Counter on flat files
 #This Example computes groups for:
-#region,count(region),sum(sales),avg(sales),max(sales),min(sales)
+#region, count(region), sum(sales), avg(sales), max(sales), min(sales)
 #
 #By Nestor Nissen
 
@@ -9,7 +9,7 @@
 from itertools import groupby
 from collections import defaultdict, Counter
 
-rawdata='''
+rawdata = '''
 Region,City,Sales
 Scotland,Edinburgh,20000
 Scotland,Glasgow,12500
@@ -20,49 +20,49 @@
 England,Liverpool,29700
 '''.splitlines()[1:]
 
-indata=list(DictReader(rawdata))
+indata = list(DictReader(rawdata))
 
 print('Using sort and groupby:')
-counts=[]
-sums=[]
-avgs=[]
-maxs=[]
-mins=[]
-ordered_data=sorted(indata,key=itemgetter('Region'))
-for region,group in groupby(ordered_data,key=itemgetter('Region')):
-    group_list=list(group)
-    count=sum(1 for city in group_list)
-    total=sum(int(city['Sales']) for city in group_list)
-    maxsale=max(int(city['Sales']) for city in group_list)
-    minsale=min(int(city['Sales']) for city in group_list)
-    counts.append((region,count))
-    sums.append((region,total))
-    avgs.append((region,total/count))
-    maxs.append((region,maxsale))
-    mins.append((region,minsale))
-print('count:',counts,'\nsum:',sums,'\navg:',avgs,
-      '\nmax:',maxs,'\nmin:',mins,'\n')
+counts = []
+sums = []
+avgs = []
+maxs = []
+mins = []
+ordered_data = sorted(indata, key=itemgetter('Region'))
+for region, group in groupby(ordered_data, key=itemgetter('Region')):
+    group_list = list(group)
+    count = sum(1 for city in group_list)
+    total = sum(int(city['Sales']) for city in group_list)
+    maxsale = max(int(city['Sales']) for city in group_list)
+    minsale = min(int(city['Sales']) for city in group_list)
+    counts.append((region, count))
+    sums.append((region, total))
+    avgs.append((region, total/count))
+    maxs.append((region, maxsale))
+    mins.append((region, minsale))
+print('count:',counts, '\nsum:',sums, '\navg:',avgs,
+      '\nmax:',maxs, '\nmin:',mins, '\n')
 
 print('Using defaultdict:')
-dd_counts=defaultdict(int)
-dd_sales=defaultdict(int)
-dd_maxs=defaultdict(int)
-dd_mins=defaultdict(lambda: 9**99)
+dd_counts = defaultdict(int)
+dd_sales = defaultdict(int)
+dd_maxs = defaultdict(int)
+dd_mins = defaultdict(lambda: 9**99)
 for row in indata:
-    region=row['Region']
-    sales=int(row['Sales'])
-    dd_counts[region]+=1
-    dd_sales[region]+=sales
-    dd_maxs[region]=max(dd_maxs[region],sales)
-    dd_mins[region]=min(dd_mins[region],sales)
-counts=list(dd_counts.items())
-sums=list(dd_sales.items())
-avgs=[(key,dd_sales[key]/count) for key,count in dd_counts.items()]
-maxs=list(dd_maxs.items())
-mins=list(dd_mins.items())
-print('count:',counts,'\nsum:',sums,'\navg:',avgs,
-      '\nmax:',maxs,'\nmin:',mins,'\n')
+    region = row['Region']
+    sales = int(row['Sales'])
+    dd_counts[region] += 1
+    dd_sales[region] += sales
+    dd_maxs[region] = max(dd_maxs[region], sales)
+    dd_mins[region] = min(dd_mins[region], sales)
+counts = list(dd_counts.items())
+sums = list(dd_sales.items())
+avgs = [(key, dd_sales[key]/count) for key, count in dd_counts.items()]
+maxs = list(dd_maxs.items())
+mins = list(dd_mins.items())
+print('count:',counts, '\nsum:',sums, '\navg:',avgs,
+      '\nmax:',maxs, '\nmin:',mins, '\n')
 
 print('Using counter:')
-counts=list(Counter(map(itemgetter('Region'),indata)).items())
+counts = list(Counter(map(itemgetter('Region'), indata)).items())
 print('count:',counts)

History