Welcome, guest | Sign In | My Account | Store | Cart

Python needs a unique function, this is my best try so far. It uses the faster algorithm (among five) for each situation.

Python, 157 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from itertools import groupby # for unique function.
from bisect import bisect_left, insort_left # for unique function.


def unique(seq, stable=False):
    """unique(seq, stable=False): return a list of the elements in seq in arbitrary
    order, but without duplicates.
    If stable=True it keeps the original element order (using slower algorithms)."""
    # Developed from Tim Peters version:
    #   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560

    #if uniqueDebug and len(str(seq))<50: print "Input:", seq # For debugging.

    # Special case of an empty s:
    if not seq: return []

    # if it's a set:
    if isinstance(seq, set): return list(seq)

    if stable:
        # Try with a set:
        seqSet= set()
        result = []
        try:
            for e in seq:
                if e not in seqSet:
                    result.append(e)
                    seqSet.add(e)
        except TypeError:
            pass # move on to the next method
        else:
            #if uniqueDebug: print "Stable, set."
            return result

        # Since you can't hash all elements, use a bisection on sorted elements
        result = []
        sortedElem = []
        try:
            for elem in seq:
                pos = bisect_left(sortedElem, elem)
                if pos >= len(sortedElem) or sortedElem[pos] != elem:
                    insort_left(sortedElem, elem)
                    result.append(elem)
        except TypeError:
            pass  # Move on to the next method
        else:
            #if uniqueDebug: print "Stable, bisect."
            return result
    else: # Not stable
        # Try using a set first, because it's the fastest and it usually works
        try:
            u = set(seq)
        except TypeError:
            pass # move on to the next method
        else:
            #if uniqueDebug: print "Unstable, set."
            return list(u)

        # Elements can't be hashed, so bring equal items together with a sort and
        # remove them out in a single pass.
        try:
            t = sorted(seq)
        except TypeError:
            pass  # Move on to the next method
        else:
            #if uniqueDebug: print "Unstable, sorted."
            return [elem for elem,group in groupby(t)]

    # Brute force:
    result = []
    for elem in seq:
        if elem not in result:
            result.append(elem)
    #if uniqueDebug: print "Brute force (" + ("Unstable","Stable")[stable] + ")."
    return result


# Following a suggestion from Alex Martelli, sometimes this uniquePick
# is faster for more than about 300 unsortable and unhashable elements:

from cPickle import dumps

def uniquePick(seq):
    result = []
    seen = set()
    for elem in seq:
        key = dumps(elem, protocol=-1)
        if key not in seen:
             seen.add(key)
             result.append(elem)
    return result


if __name__ == '__main__': # Test
    from time import clock
    #uniqueDebug = False
    print "unique asserts" # ************************
    print "  Unstable unique..."
    assert unique( [] ) == []
    assert sorted(unique( [3, 2, 3, 1, 2] )) == [1, 2, 3]
    r = range(10**5)
    assert sorted(unique( set(r) )) == r
    s = "iterable dict based unique"
    assert sorted(unique(s)) == [' ','a','b','c','d','e','i','l','n','q','r','s','t','u']
    assert unique( [[1], [2]] ) == [[1], [2]]
    assert unique( ((1,),(2,)) ) == [(2,), (1,)]
    assert unique( ([1,],[2,]) ) == [[1], [2]]
    assert unique( ([1+2J],[2+1J],[1+2J]) ) == [[1+2j], [2+1j]]
    assert unique( ([1+2J],[1+2J]) ) == [[1+2j]]
    assert unique( [0] * 1000 ) == [0]

    print "  Stable unique..."
    assert unique( [1, 2, 3, 1, 2], True ) == [1, 2, 3]
    assert unique( [3, 2, 3, 1, 2], True ) == [3, 2, 1]
    r = range(10**5)
    assert sorted(unique( set(r), True )) == r
    s = "iterable dict based unique"
    assert unique(s, True) == ['i','t','e','r','a','b','l',' ','d','c','s','u','n','q']
    assert unique( [[1], [2]], True ) == [[1], [2]]
    assert unique( ((1,),(2,)), True ) == [(1,), (2,)]
    assert unique( ([1,],[2,]), True ) == [[1], [2]]
    assert unique( ([1+2J],[2+1J],[1+2J]), True ) == [[1+2j], [2+1j]]
    assert unique( ([1+2J],[1+2J]), True ) == [[1+2j]]
    assert unique( [0] * 1000, True ) == [0]
    assert unique( list("cbcab"), True ) == ['c', 'b', 'a']
    s = "bisect based unique"
    assert unique(s, True) == ['b', 'i', 's', 'e', 'c', 't', ' ', 'a', 'd', 'u', 'n', 'q']

    print "  unique timings:"
    #uniqueDebug = True
    print "   with a set:"
    l = range(10**5) * 10
    t = clock()
    unique( l )
    print "   ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "   ", round(clock()-t, 2)

    print "   with a sort/bisect:"
    l = [[e] for e in range(10**4)] * 10
    t = clock()
    unique( l )
    print "   ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "   ", round(clock()-t, 2)

    print "   with the brute force algorithm:"
    l = [[complex(e,e)] for e in range(3*10**2)] * 10
    t = clock()
    unique( l )
    print "   ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "   ", round(clock()-t, 2)
    print

This unique can be stable or unstable, this means that it can keep the original order of elements or not. Unstable allows to use a faster code. A xunique function can be useful too, sometimes.

1 comment

james turner 16 years, 4 months ago  # | flag

Old Way. I just ran your results with set(), and saw the conditions under which a type error occurs, which leads me to believe set uses the same underlying hashing the python dict type uses, since I ran into the same error when I did it the old way:

def unique(seq):
    return dict.fromkeys(seq).keys()

The performance is so similar to set() that there must be a lot of shared code on the C side (2.13 seconds versus 1.57 or so in a test I ran). I'm sure the new set() stuff is a bit simpler and the python side is definitely cleaner.

Interesting stuff--thanks.