Python needs a unique() function; this is my best attempt so far. It chooses the fastest algorithm (among five) for each situation.
from itertools import groupby                 # For the unique function.
from bisect import bisect_left, insort_left   # For the unique function.
def unique(seq, stable=False):
    """unique(seq, stable=False): return a list of the elements in seq in
    arbitrary order, but without duplicates.
    If stable=True it keeps the original element order (using slower algorithms)."""
    # Developed from Tim Peters' version:
    # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560
    #if uniqueDebug and len(str(seq)) < 50: print "Input:", seq  # For debugging.

    # Special case of an empty seq:
    if not seq:
        return []
    # If seq is already a set, its elements are already unique:
    if isinstance(seq, set):
        return list(seq)
    if stable:
        # Try with a set first:
        seqSet = set()
        result = []
        try:
            for e in seq:
                if e not in seqSet:
                    result.append(e)
                    seqSet.add(e)
        except TypeError:
            pass  # Move on to the next method.
        else:
            #if uniqueDebug: print "Stable, set."
            return result
        # Not all the elements are hashable, so use bisection on sorted elements:
        result = []
        sortedElem = []
        try:
            for elem in seq:
                pos = bisect_left(sortedElem, elem)
                if pos >= len(sortedElem) or sortedElem[pos] != elem:
                    insort_left(sortedElem, elem)
                    result.append(elem)
        except TypeError:
            pass  # Move on to the next method.
        else:
            #if uniqueDebug: print "Stable, bisect."
            return result
    else:  # Not stable.
        # Try using a set first, because it's the fastest and it usually works:
        try:
            u = set(seq)
        except TypeError:
            pass  # Move on to the next method.
        else:
            #if uniqueDebug: print "Unstable, set."
            return list(u)
        # Elements can't be hashed, so bring equal items together with a sort
        # and remove the duplicates in a single pass:
        try:
            t = sorted(seq)
        except TypeError:
            pass  # Move on to the next method.
        else:
            #if uniqueDebug: print "Unstable, sorted."
            return [elem for elem, group in groupby(t)]
    # Brute force:
    result = []
    for elem in seq:
        if elem not in result:
            result.append(elem)
    #if uniqueDebug: print "Brute force (" + ("Unstable", "Stable")[stable] + ")."
    return result
# Following a suggestion from Alex Martelli, this uniquePick is sometimes
# faster for more than about 300 unsortable and unhashable elements:
from cPickle import dumps

def uniquePick(seq):
    # Use the pickled form of each element as a hashable key for a set.
    result = []
    seen = set()
    for elem in seq:
        key = dumps(elem, protocol=-1)
        if key not in seen:
            seen.add(key)
            result.append(elem)
    return result
if __name__ == '__main__':  # Test.
    from time import clock
    #uniqueDebug = False

    print "unique asserts"  # ************************
    print "  Unstable unique..."
    assert unique( [] ) == []
    assert sorted(unique( [3, 2, 3, 1, 2] )) == [1, 2, 3]
    r = range(10**5)
    assert sorted(unique( set(r) )) == r
    s = "iterable dict based unique"
    assert sorted(unique(s)) == [' ','a','b','c','d','e','i','l','n','q','r','s','t','u']
    assert unique( [[1], [2]] ) == [[1], [2]]
    assert unique( ((1,),(2,)) ) == [(2,), (1,)]
    assert unique( ([1,],[2,]) ) == [[1], [2]]
    assert unique( ([1+2J],[2+1J],[1+2J]) ) == [[1+2j], [2+1j]]
    assert unique( ([1+2J],[1+2J]) ) == [[1+2j]]
    assert unique( [0] * 1000 ) == [0]

    print "  Stable unique..."
    assert unique( [1, 2, 3, 1, 2], True ) == [1, 2, 3]
    assert unique( [3, 2, 3, 1, 2], True ) == [3, 2, 1]
    r = range(10**5)
    assert sorted(unique( set(r), True )) == r
    s = "iterable dict based unique"
    assert unique(s, True) == ['i','t','e','r','a','b','l',' ','d','c','s','u','n','q']
    assert unique( [[1], [2]], True ) == [[1], [2]]
    assert unique( ((1,),(2,)), True ) == [(1,), (2,)]
    assert unique( ([1,],[2,]), True ) == [[1], [2]]
    assert unique( ([1+2J],[2+1J],[1+2J]), True ) == [[1+2j], [2+1j]]
    assert unique( ([1+2J],[1+2J]), True ) == [[1+2j]]
    assert unique( [0] * 1000, True ) == [0]
    assert unique( list("cbcab"), True ) == ['c', 'b', 'a']
    s = "bisect based unique"
    assert unique(s, True) == ['b', 'i', 's', 'e', 'c', 't', ' ', 'a', 'd', 'u', 'n', 'q']

    print "  unique timings:"
    #uniqueDebug = True
    print "    with a set:"
    l = range(10**5) * 10
    t = clock()
    unique( l )
    print "      ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "      ", round(clock()-t, 2)
    print "    with a sort/bisect:"
    l = [[e] for e in range(10**4)] * 10
    t = clock()
    unique( l )
    print "      ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "      ", round(clock()-t, 2)
    print "    with the brute force algorithm:"
    l = [[complex(e,e)] for e in range(3*10**2)] * 10
    t = clock()
    unique( l )
    print "      ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "      ", round(clock()-t, 2)
    print
This unique() can be stable or unstable, meaning it can keep the original order of the elements or not; the unstable version allows faster code. An xunique generator function can be useful too, sometimes.
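xunique is only mentioned above, so here is a minimal sketch of what a lazy version could look like, assuming the elements are hashable (the name and code are an illustration, not part of the function above):

def xunique(seq):
    # Lazy stable version: yield the elements of seq in their original
    # order, skipping duplicates. Assumes the elements are hashable.
    seen = set()
    for elem in seq:
        if elem not in seen:
            seen.add(elem)
            yield elem

For example, list(xunique("cbcab")) gives ['c', 'b', 'a'], the same as unique(list("cbcab"), True), but the elements are produced one at a time.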
Tags: algorithms
Old way. I just ran your tests with set() and saw the conditions under which a TypeError occurs, which leads me to believe set() uses the same underlying hashing the Python dict type uses, since I ran into the same error when I did it the old way:
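A sketch of that old dict-based idiom (uniqueDict is an illustrative name; dict keys go through the same hashing as set elements, hence the same TypeError):

def uniqueDict(seq):
    # Pre-set() idiom: use dict keys to remove the duplicates. Unhashable
    # elements raise the same TypeError as set(seq) does.
    d = {}
    for elem in seq:
        d[elem] = None
    return d.keys()  # In Python 2 this is a plain list.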
The performance is so similar to set() that there must be a lot of shared code on the C side (2.13 seconds versus 1.57 or so in a test I ran). I'm sure the new set() stuff is a bit simpler, and the Python side is definitely cleaner.
Interesting stuff--thanks.