#How to do aggregates using groupby, defaultdict and Counter on flat files #This Example computes groups for: #region, count(region), sum(sales), avg(sales), max(sales), min(sales) # #By Nestor Nissen from csv import DictReader from operator import itemgetter from itertools import groupby from collections import defaultdict, Counter rawdata = ''' Region,City,Sales Scotland,Edinburgh,20000 Scotland,Glasgow,12500 Wales,Cardiff,29700 Wales,Bangor,12800 England,London,90000 England,Manchester,45600 England,Liverpool,29700 '''.splitlines()[1:] indata = list(DictReader(rawdata)) print('Using sort and groupby:') counts = [] sums = [] avgs = [] maxs = [] mins = [] ordered_data = sorted(indata, key=itemgetter('Region')) for region, group in groupby(ordered_data, key=itemgetter('Region')): group_list = list(group) count = sum(1 for city in group_list) total = sum(int(city['Sales']) for city in group_list) maxsale = max(int(city['Sales']) for city in group_list) minsale = min(int(city['Sales']) for city in group_list) counts.append((region, count)) sums.append((region, total)) avgs.append((region, total/count)) maxs.append((region, maxsale)) mins.append((region, minsale)) print('count:',counts, '\nsum:',sums, '\navg:',avgs, '\nmax:',maxs, '\nmin:',mins, '\n') print('Using defaultdict:') dd_counts = defaultdict(int) dd_sales = defaultdict(int) dd_maxs = defaultdict(int) dd_mins = defaultdict(lambda: 9**99) for row in indata: region = row['Region'] sales = int(row['Sales']) dd_counts[region] += 1 dd_sales[region] += sales dd_maxs[region] = max(dd_maxs[region], sales) dd_mins[region] = min(dd_mins[region], sales) counts = list(dd_counts.items()) sums = list(dd_sales.items()) avgs = [(key, dd_sales[key]/count) for key, count in dd_counts.items()] maxs = list(dd_maxs.items()) mins = list(dd_mins.items()) print('count:',counts, '\nsum:',sums, '\navg:',avgs, '\nmax:',maxs, '\nmin:',mins, '\n') print('Using counter:') counts = list(Counter(map(itemgetter('Region'), indata)).items()) print('count:',counts)