Welcome, guest | Sign In | My Account | Store | Cart
from itertools import groupby # for unique function.
from bisect import bisect_left, insort_left # for unique function.


def unique(seq, stable=False):
    """unique(seq, stable=False): return a list of the elements in seq in arbitrary
    order, but without duplicates.
    If stable=True it keeps the original element order (using slower algorithms)."""
    # Developed from Tim Peters version:
    #   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560

    #if uniqueDebug and len(str(seq))<50: print "Input:", seq # For debugging.

    # Special case of an empty s:
    if not seq: return []

    # if it's a set:
    if isinstance(seq, set): return list(seq)

    if stable:
        # Try with a set:
        seqSet= set()
        result = []
        try:
            for e in seq:
                if e not in seqSet:
                    result.append(e)
                    seqSet.add(e)
        except TypeError:
            pass # move on to the next method
        else:
            #if uniqueDebug: print "Stable, set."
            return result

        # Since you can't hash all elements, use a bisection on sorted elements
        result = []
        sortedElem = []
        try:
            for elem in seq:
                pos = bisect_left(sortedElem, elem)
                if pos >= len(sortedElem) or sortedElem[pos] != elem:
                    insort_left(sortedElem, elem)
                    result.append(elem)
        except TypeError:
            pass  # Move on to the next method
        else:
            #if uniqueDebug: print "Stable, bisect."
            return result
    else: # Not stable
        # Try using a set first, because it's the fastest and it usually works
        try:
            u = set(seq)
        except TypeError:
            pass # move on to the next method
        else:
            #if uniqueDebug: print "Unstable, set."
            return list(u)

        # Elements can't be hashed, so bring equal items together with a sort and
        # remove them out in a single pass.
        try:
            t = sorted(seq)
        except TypeError:
            pass  # Move on to the next method
        else:
            #if uniqueDebug: print "Unstable, sorted."
            return [elem for elem,group in groupby(t)]

    # Brute force:
    result = []
    for elem in seq:
        if elem not in result:
            result.append(elem)
    #if uniqueDebug: print "Brute force (" + ("Unstable","Stable")[stable] + ")."
    return result


# Following a suggestion from Alex Martelli, sometimes this uniquePick
# is faster for more than about 300 unsortable and unhashable elements:

from cPickle import dumps

def uniquePick(seq):
    result = []
    seen = set()
    for elem in seq:
        key = dumps(elem, protocol=-1)
        if key not in seen:
             seen.add(key)
             result.append(elem)
    return result


if __name__ == '__main__': # Test
    from time import clock
    #uniqueDebug = False
    print "unique asserts" # ************************
    print "  Unstable unique..."
    assert unique( [] ) == []
    assert sorted(unique( [3, 2, 3, 1, 2] )) == [1, 2, 3]
    r = range(10**5)
    assert sorted(unique( set(r) )) == r
    s = "iterable dict based unique"
    assert sorted(unique(s)) == [' ','a','b','c','d','e','i','l','n','q','r','s','t','u']
    assert unique( [[1], [2]] ) == [[1], [2]]
    assert unique( ((1,),(2,)) ) == [(2,), (1,)]
    assert unique( ([1,],[2,]) ) == [[1], [2]]
    assert unique( ([1+2J],[2+1J],[1+2J]) ) == [[1+2j], [2+1j]]
    assert unique( ([1+2J],[1+2J]) ) == [[1+2j]]
    assert unique( [0] * 1000 ) == [0]

    print "  Stable unique..."
    assert unique( [1, 2, 3, 1, 2], True ) == [1, 2, 3]
    assert unique( [3, 2, 3, 1, 2], True ) == [3, 2, 1]
    r = range(10**5)
    assert sorted(unique( set(r), True )) == r
    s = "iterable dict based unique"
    assert unique(s, True) == ['i','t','e','r','a','b','l',' ','d','c','s','u','n','q']
    assert unique( [[1], [2]], True ) == [[1], [2]]
    assert unique( ((1,),(2,)), True ) == [(1,), (2,)]
    assert unique( ([1,],[2,]), True ) == [[1], [2]]
    assert unique( ([1+2J],[2+1J],[1+2J]), True ) == [[1+2j], [2+1j]]
    assert unique( ([1+2J],[1+2J]), True ) == [[1+2j]]
    assert unique( [0] * 1000, True ) == [0]
    assert unique( list("cbcab"), True ) == ['c', 'b', 'a']
    s = "bisect based unique"
    assert unique(s, True) == ['b', 'i', 's', 'e', 'c', 't', ' ', 'a', 'd', 'u', 'n', 'q']

    print "  unique timings:"
    #uniqueDebug = True
    print "   with a set:"
    l = range(10**5) * 10
    t = clock()
    unique( l )
    print "   ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "   ", round(clock()-t, 2)

    print "   with a sort/bisect:"
    l = [[e] for e in range(10**4)] * 10
    t = clock()
    unique( l )
    print "   ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "   ", round(clock()-t, 2)

    print "   with the brute force algorithm:"
    l = [[complex(e,e)] for e in range(3*10**2)] * 10
    t = clock()
    unique( l )
    print "   ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "   ", round(clock()-t, 2)
    print

History

  • revision 3 (18 years ago)
  • previous revisions are not available