#How to do aggregates using groupby, defaultdict and Counter on flat files
#This Example computes groups for:
#region, count(region), sum(sales), avg(sales), max(sales), min(sales)
#
#By Nestor Nissen
from csv import DictReader
from operator import itemgetter
from itertools import groupby
from collections import defaultdict, Counter
rawdata = '''
Region,City,Sales
Scotland,Edinburgh,20000
Scotland,Glasgow,12500
Wales,Cardiff,29700
Wales,Bangor,12800
England,London,90000
England,Manchester,45600
England,Liverpool,29700
'''.splitlines()[1:]
indata = list(DictReader(rawdata))
print('Using sort and groupby:')
counts = []
sums = []
avgs = []
maxs = []
mins = []
ordered_data = sorted(indata, key=itemgetter('Region'))
for region, group in groupby(ordered_data, key=itemgetter('Region')):
group_list = list(group)
count = sum(1 for city in group_list)
total = sum(int(city['Sales']) for city in group_list)
maxsale = max(int(city['Sales']) for city in group_list)
minsale = min(int(city['Sales']) for city in group_list)
counts.append((region, count))
sums.append((region, total))
avgs.append((region, total/count))
maxs.append((region, maxsale))
mins.append((region, minsale))
print('count:',counts, '\nsum:',sums, '\navg:',avgs,
'\nmax:',maxs, '\nmin:',mins, '\n')
print('Using defaultdict:')
dd_counts = defaultdict(int)
dd_sales = defaultdict(int)
dd_maxs = defaultdict(int)
dd_mins = defaultdict(lambda: 9**99)
for row in indata:
region = row['Region']
sales = int(row['Sales'])
dd_counts[region] += 1
dd_sales[region] += sales
dd_maxs[region] = max(dd_maxs[region], sales)
dd_mins[region] = min(dd_mins[region], sales)
counts = list(dd_counts.items())
sums = list(dd_sales.items())
avgs = [(key, dd_sales[key]/count) for key, count in dd_counts.items()]
maxs = list(dd_maxs.items())
mins = list(dd_mins.items())
print('count:',counts, '\nsum:',sums, '\navg:',avgs,
'\nmax:',maxs, '\nmin:',mins, '\n')
print('Using counter:')
counts = list(Counter(map(itemgetter('Region'), indata)).items())
print('count:',counts)
Diff to Previous Revision
--- revision 1 2011-01-07 16:27:43
+++ revision 2 2011-02-07 15:38:09
@@ -1,6 +1,6 @@
#How to do aggregates using groupby, defaultdict and Counter on flat files
#This Example computes groups for:
-#region,count(region),sum(sales),avg(sales),max(sales),min(sales)
+#region, count(region), sum(sales), avg(sales), max(sales), min(sales)
#
#By Nestor Nissen
@@ -9,7 +9,7 @@
from itertools import groupby
from collections import defaultdict, Counter
-rawdata='''
+rawdata = '''
Region,City,Sales
Scotland,Edinburgh,20000
Scotland,Glasgow,12500
@@ -20,49 +20,49 @@
England,Liverpool,29700
'''.splitlines()[1:]
-indata=list(DictReader(rawdata))
+indata = list(DictReader(rawdata))
print('Using sort and groupby:')
-counts=[]
-sums=[]
-avgs=[]
-maxs=[]
-mins=[]
-ordered_data=sorted(indata,key=itemgetter('Region'))
-for region,group in groupby(ordered_data,key=itemgetter('Region')):
- group_list=list(group)
- count=sum(1 for city in group_list)
- total=sum(int(city['Sales']) for city in group_list)
- maxsale=max(int(city['Sales']) for city in group_list)
- minsale=min(int(city['Sales']) for city in group_list)
- counts.append((region,count))
- sums.append((region,total))
- avgs.append((region,total/count))
- maxs.append((region,maxsale))
- mins.append((region,minsale))
-print('count:',counts,'\nsum:',sums,'\navg:',avgs,
- '\nmax:',maxs,'\nmin:',mins,'\n')
+counts = []
+sums = []
+avgs = []
+maxs = []
+mins = []
+ordered_data = sorted(indata, key=itemgetter('Region'))
+for region, group in groupby(ordered_data, key=itemgetter('Region')):
+ group_list = list(group)
+ count = sum(1 for city in group_list)
+ total = sum(int(city['Sales']) for city in group_list)
+ maxsale = max(int(city['Sales']) for city in group_list)
+ minsale = min(int(city['Sales']) for city in group_list)
+ counts.append((region, count))
+ sums.append((region, total))
+ avgs.append((region, total/count))
+ maxs.append((region, maxsale))
+ mins.append((region, minsale))
+print('count:',counts, '\nsum:',sums, '\navg:',avgs,
+ '\nmax:',maxs, '\nmin:',mins, '\n')
print('Using defaultdict:')
-dd_counts=defaultdict(int)
-dd_sales=defaultdict(int)
-dd_maxs=defaultdict(int)
-dd_mins=defaultdict(lambda: 9**99)
+dd_counts = defaultdict(int)
+dd_sales = defaultdict(int)
+dd_maxs = defaultdict(int)
+dd_mins = defaultdict(lambda: 9**99)
for row in indata:
- region=row['Region']
- sales=int(row['Sales'])
- dd_counts[region]+=1
- dd_sales[region]+=sales
- dd_maxs[region]=max(dd_maxs[region],sales)
- dd_mins[region]=min(dd_mins[region],sales)
-counts=list(dd_counts.items())
-sums=list(dd_sales.items())
-avgs=[(key,dd_sales[key]/count) for key,count in dd_counts.items()]
-maxs=list(dd_maxs.items())
-mins=list(dd_mins.items())
-print('count:',counts,'\nsum:',sums,'\navg:',avgs,
- '\nmax:',maxs,'\nmin:',mins,'\n')
+ region = row['Region']
+ sales = int(row['Sales'])
+ dd_counts[region] += 1
+ dd_sales[region] += sales
+ dd_maxs[region] = max(dd_maxs[region], sales)
+ dd_mins[region] = min(dd_mins[region], sales)
+counts = list(dd_counts.items())
+sums = list(dd_sales.items())
+avgs = [(key, dd_sales[key]/count) for key, count in dd_counts.items()]
+maxs = list(dd_maxs.items())
+mins = list(dd_mins.items())
+print('count:',counts, '\nsum:',sums, '\navg:',avgs,
+ '\nmax:',maxs, '\nmin:',mins, '\n')
print('Using counter:')
-counts=list(Counter(map(itemgetter('Region'),indata)).items())
+counts = list(Counter(map(itemgetter('Region'), indata)).items())
print('count:',counts)