Welcome, guest | Sign In | My Account | Store | Cart

Bag/multiset class for convenient tallying of hashable objects.

Python, 189 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
from operator import itemgetter
from heapq import nlargest
from itertools import repeat, ifilter

class Counter(dict):
    '''Dict subclass for counting hashable objects.  Sometimes called a bag
    or multiset.  Elements are stored as dictionary keys and their counts
    are stored as dictionary values.

    >>> Counter('zyzygy')
    Counter({'y': 3, 'z': 2, 'g': 1})

    '''

    def __init__(self, iterable=None, **kwds):
        '''Create a new, empty Counter object.  And if given, count elements
        from an input iterable.  Or, initialize the count from another mapping
        of elements to their counts.

        >>> c = Counter()                           # a new, empty counter
        >>> c = Counter('gallahad')                 # a new counter from an iterable
        >>> c = Counter({'a': 4, 'b': 2})           # a new counter from a mapping
        >>> c = Counter(a=4, b=2)                   # a new counter from keyword args

        '''        
        self.update(iterable, **kwds)

    def __missing__(self, key):
        return 0

    def most_common(self, n=None):
        '''List the n most common elements and their counts from the most
        common to the least.  If n is None, then list all element counts.

        >>> Counter('abracadabra').most_common(3)
        [('a', 5), ('r', 2), ('b', 2)]

        '''        
        if n is None:
            return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
        return nlargest(n, self.iteritems(), key=itemgetter(1))

    def elements(self):
        '''Iterator over elements repeating each as many times as its count.

        >>> c = Counter('ABCABC')
        >>> sorted(c.elements())
        ['A', 'A', 'B', 'B', 'C', 'C']

        If an element's count has been set to zero or is a negative number,
        elements() will ignore it.

        '''
        for elem, count in self.iteritems():
            for _ in repeat(None, count):
                yield elem

    # Override dict methods where the meaning changes for Counter objects.

    @classmethod
    def fromkeys(cls, iterable, v=None):
        raise NotImplementedError(
            'Counter.fromkeys() is undefined.  Use Counter(iterable) instead.')

    def update(self, iterable=None, **kwds):
        '''Like dict.update() but add counts instead of replacing them.

        Source can be an iterable, a dictionary, or another Counter instance.

        >>> c = Counter('which')
        >>> c.update('witch')           # add elements from another iterable
        >>> d = Counter('watch')
        >>> c.update(d)                 # add elements from another counter
        >>> c['h']                      # four 'h' in which, witch, and watch
        4

        '''        
        if iterable is not None:
            if hasattr(iterable, 'iteritems'):
                if self:
                    self_get = self.get
                    for elem, count in iterable.iteritems():
                        self[elem] = self_get(elem, 0) + count
                else:
                    dict.update(self, iterable) # fast path when counter is empty
            else:
                self_get = self.get
                for elem in iterable:
                    self[elem] = self_get(elem, 0) + 1
        if kwds:
            self.update(kwds)

    def copy(self):
        'Like dict.copy() but returns a Counter instance instead of a dict.'
        return Counter(self)

    def __delitem__(self, elem):
        'Like dict.__delitem__() but does not raise KeyError for missing values.'
        if elem in self:
            dict.__delitem__(self, elem)

    def __repr__(self):
        if not self:
            return '%s()' % self.__class__.__name__
        items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
        return '%s({%s})' % (self.__class__.__name__, items)

    # Multiset-style mathematical operations discussed in:
    #       Knuth TAOCP Volume II section 4.6.3 exercise 19
    #       and at http://en.wikipedia.org/wiki/Multiset
    #
    # Outputs guaranteed to only include positive counts.
    #
    # To strip negative and zero counts, add-in an empty counter:
    #       c += Counter()

    def __add__(self, other):
        '''Add counts from two counters.

        >>> Counter('abbb') + Counter('bcc')
        Counter({'b': 4, 'c': 2, 'a': 1})


        '''
        if not isinstance(other, Counter):
            return NotImplemented
        result = Counter()
        for elem in set(self) | set(other):
            newcount = self[elem] + other[elem]
            if newcount > 0:
                result[elem] = newcount
        return result

    def __sub__(self, other):
        ''' Subtract count, but keep only results with positive counts.

        >>> Counter('abbbc') - Counter('bccd')
        Counter({'b': 2, 'a': 1})

        '''
        if not isinstance(other, Counter):
            return NotImplemented
        result = Counter()
        for elem in set(self) | set(other):
            newcount = self[elem] - other[elem]
            if newcount > 0:
                result[elem] = newcount
        return result

    def __or__(self, other):
        '''Union is the maximum of value in either of the input counters.

        >>> Counter('abbb') | Counter('bcc')
        Counter({'b': 3, 'c': 2, 'a': 1})

        '''
        if not isinstance(other, Counter):
            return NotImplemented
        _max = max
        result = Counter()
        for elem in set(self) | set(other):
            newcount = _max(self[elem], other[elem])
            if newcount > 0:
                result[elem] = newcount
        return result

    def __and__(self, other):
        ''' Intersection is the minimum of corresponding counts.

        >>> Counter('abbb') & Counter('bcc')
        Counter({'b': 1})

        '''
        if not isinstance(other, Counter):
            return NotImplemented
        _min = min
        result = Counter()
        if len(self) < len(other):
            self, other = other, self
        for elem in ifilter(self.__contains__, other):
            newcount = _min(self[elem], other[elem])
            if newcount > 0:
                result[elem] = newcount
        return result


if __name__ == '__main__':
    import doctest
    print doctest.testmod()

This recipe was added to the collections module in Python 2.7. It has been modified here so that it runs on Python 2.5 or later. See the online docs at: http://docs.python.org/dev/library/collections.html#counter-objects

It is a simple dictionary subclass. By defining __missing__(), it automatically treats missing elements as having a count of zero. The most_common() method lists counts in decreasing order of frequency. The elements() method lists all elements with repeats as many times as their multiplicity. The update() method takes either an iterable of elements or a mapping of elements to their counts (the mapping can be another counter object). Unlike dict.update(), this method adds-in counts instead of replaces them.

Examples:

Tally occurrences of words in a list

>>> cnt = Counter()
>>> for word in ['red', 'blue', 'red', 'green', 'blue', 'blue']:
...     cnt[word] += 1
>>> cnt
Counter({'blue': 3, 'red': 2, 'green': 1})

Find the ten most common words in Hamlet

>>> import re
>>> words = re.findall('\w+', open('hamlet.txt').read().lower())
>>> Counter(hamlet_words).most_common(10)
[('the', 1143), ('and', 966), ('to', 762), ('of', 669), ('i', 631),
 ('you', 554),  ('a', 546), ('my', 514), ('hamlet', 471), ('in', 451)]

Multiset examples

>>> c = Counter(a=3, b=1)
>>> d = Counter(a=1, b=2)
>>> c + d                           # add two counters together:  c[x] + d[x]
Counter({'a': 4, 'b': 3})
>>> c - d                           # subtract (keeping only positive counts)
Counter({'a': 2})
>>> c & d                           # intersection:  min(c[x], d[x])
Counter({'a': 1, 'b': 1})
>>> c | d                           # union:  max(c[x], d[x])
Counter({'a': 3, 'b': 2})

Answer to Larry's question: the signature for Counter.update() matches the signatures of set.update() and dict.update() which also allow an empty argument. Also, the empty positional argument is handy when keyword arguments are being used: c.update(tests_run=1, tests_remaining=-1, tests_passed=result).

4 comments

Larry Hastings 15 years, 2 months ago  # | flag

A minor question--why is the signature of update() like that? I don't see why anyone would call Counter.update() without an argument. I'd drop the "=None" from the arguments list and the "if iterable is not None:" statement. Or am I missing something?

In a similar vein, under what circumstances would Counter.__repr__ ever be called with self=None? How would you even do that? Python 2.6 won't let me. Again, I wonder what I'm missing.

Louis RIVIERE 15 years, 2 months ago  # | flag

This may be usefull :

def subtract(self, iterable):
    if hasattr(iterable, 'iteritems'):
        for elem, count in iterable.iteritems():
            self[elem] -= count
    else:
        for elem in iterable:
            self[elem] -= 1

It's the same thing as update() though, so it could be factorized.

gump huang 10 years, 8 months ago  # | flag

Great! and I am Registering to say that maybe "words" should be replaced by "hamlet_words" or contrarily!

Martin Miller 10 years, 4 months ago  # | flag

Regarding Louis RIVIERE's comment about also having a subtract() method:

Besides potentially being useful, the version of the Counter class in Python 2.7 includes a method of that name -- so it should be added to this recipe for a more complete emulation. Until that officially happens, here's a copy of the source code of it from CPython 2.7.5:

from collections import Mapping  # added in Python 2.4

def subtract(self, iterable=None, **kwds):
    '''Like dict.update() but subtracts counts instead of replacing them.
    Counts can be reduced below zero.  Both the inputs and outputs are
    allowed to contain zero and negative counts.

    Source can be an iterable, a dictionary, or another Counter instance.

    >>> c = Counter('which')
    >>> c.subtract('witch')             # subtract elements from another iterable
    >>> c.subtract(Counter('watch'))    # subtract elements from another counter
    >>> c['h']                          # 2 in which, minus 1 in witch, minus 1 in watch
    0
    >>> c['w']                          # 1 in which, minus 1 in witch, minus 1 in watch
    -1
    '''
    if iterable is not None:
        self_get = self.get
        if isinstance(iterable, Mapping):
            for elem, count in iterable.items():
                self[elem] = self_get(elem, 0) - count
        else:
            for elem in iterable:
                self[elem] = self_get(elem, 0) - 1
    if kwds:
        self.subtract(kwds)

Note that unlike __sub__(), this method accepts an iterable or mapping as an argument (rather than only another Counter object). Another significant difference between them -- as noted in the doc string -- is that items that end up with a count of zero (or less) are not removed by the above.