Unique stable-unstable « Python recipes

Python needs a unique function, this is my best try so far. It uses the faster algorithm (among five) for each situation.

      from itertools import groupby # for unique function.
from bisect import bisect_left, insort_left # for unique function.


def unique(seq, stable=False):
    """unique(seq, stable=False): return a list of the elements in seq in arbitrary
    order, but without duplicates.
    If stable=True it keeps the original element order (using slower algorithms)."""
    # Developed from Tim Peters version:
    #   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560

    #if uniqueDebug and len(str(seq))<50: print "Input:", seq # For debugging.

    # Special case of an empty s:
    if not seq: return []

    # if it's a set:
    if isinstance(seq, set): return list(seq)

    if stable:
        # Try with a set:
        seqSet= set()
        result = []
        try:
            for e in seq:
                if e not in seqSet:
                    result.append(e)
                    seqSet.add(e)
        except TypeError:
            pass # move on to the next method
        else:
            #if uniqueDebug: print "Stable, set."
            return result

        # Since you can't hash all elements, use a bisection on sorted elements
        result = []
        sortedElem = []
        try:
            for elem in seq:
                pos = bisect_left(sortedElem, elem)
                if pos >= len(sortedElem) or sortedElem[pos] != elem:
                    insort_left(sortedElem, elem)
                    result.append(elem)
        except TypeError:
            pass  # Move on to the next method
        else:
            #if uniqueDebug: print "Stable, bisect."
            return result
    else: # Not stable
        # Try using a set first, because it's the fastest and it usually works
        try:
            u = set(seq)
        except TypeError:
            pass # move on to the next method
        else:
            #if uniqueDebug: print "Unstable, set."
            return list(u)

        # Elements can't be hashed, so bring equal items together with a sort and
        # remove them out in a single pass.
        try:
            t = sorted(seq)
        except TypeError:
            pass  # Move on to the next method
        else:
            #if uniqueDebug: print "Unstable, sorted."
            return [elem for elem,group in groupby(t)]

    # Brute force:
    result = []
    for elem in seq:
        if elem not in result:
            result.append(elem)
    #if uniqueDebug: print "Brute force (" + ("Unstable","Stable")[stable] + ")."
    return result


# Following a suggestion from Alex Martelli, sometimes this uniquePick
# is faster for more than about 300 unsortable and unhashable elements:

from cPickle import dumps

def uniquePick(seq):
    result = []
    seen = set()
    for elem in seq:
        key = dumps(elem, protocol=-1)
        if key not in seen:
             seen.add(key)
             result.append(elem)
    return result


if __name__ == '__main__': # Test
    from time import clock
    #uniqueDebug = False
    print "unique asserts" # ************************
    print "  Unstable unique..."
    assert unique( [] ) == []
    assert sorted(unique( [3, 2, 3, 1, 2] )) == [1, 2, 3]
    r = range(10**5)
    assert sorted(unique( set(r) )) == r
    s = "iterable dict based unique"
    assert sorted(unique(s)) == [' ','a','b','c','d','e','i','l','n','q','r','s','t','u']
    assert unique( [[1], [2]] ) == [[1], [2]]
    assert unique( ((1,),(2,)) ) == [(2,), (1,)]
    assert unique( ([1,],[2,]) ) == [[1], [2]]
    assert unique( ([1+2J],[2+1J],[1+2J]) ) == [[1+2j], [2+1j]]
    assert unique( ([1+2J],[1+2J]) ) == [[1+2j]]
    assert unique( [0] * 1000 ) == [0]

    print "  Stable unique..."
    assert unique( [1, 2, 3, 1, 2], True ) == [1, 2, 3]
    assert unique( [3, 2, 3, 1, 2], True ) == [3, 2, 1]
    r = range(10**5)
    assert sorted(unique( set(r), True )) == r
    s = "iterable dict based unique"
    assert unique(s, True) == ['i','t','e','r','a','b','l',' ','d','c','s','u','n','q']
    assert unique( [[1], [2]], True ) == [[1], [2]]
    assert unique( ((1,),(2,)), True ) == [(1,), (2,)]
    assert unique( ([1,],[2,]), True ) == [[1], [2]]
    assert unique( ([1+2J],[2+1J],[1+2J]), True ) == [[1+2j], [2+1j]]
    assert unique( ([1+2J],[1+2J]), True ) == [[1+2j]]
    assert unique( [0] * 1000, True ) == [0]
    assert unique( list("cbcab"), True ) == ['c', 'b', 'a']
    s = "bisect based unique"
    assert unique(s, True) == ['b', 'i', 's', 'e', 'c', 't', ' ', 'a', 'd', 'u', 'n', 'q']

    print "  unique timings:"
    #uniqueDebug = True
    print "   with a set:"
    l = range(10**5) * 10
    t = clock()
    unique( l )
    print "   ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "   ", round(clock()-t, 2)

    print "   with a sort/bisect:"
    l = [[e] for e in range(10**4)] * 10
    t = clock()
    unique( l )
    print "   ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "   ", round(clock()-t, 2)

    print "   with the brute force algorithm:"
    l = [[complex(e,e)] for e in range(3*10**2)] * 10
    t = clock()
    unique( l )
    print "   ", round(clock()-t, 2)
    t = clock()
    unique( l, True )
    print "   ", round(clock()-t, 2)
    print

      

This unique can be stable or unstable, this means that it can keep the original order of elements or not. Unstable allows to use a faster code. A xunique function can be useful too, sometimes.

Tags: algorithms

1 comment

james turner 18 years, 8 months ago # | flag

Old Way. I just ran your results with set(), and saw the conditions under which a type error occurs, which leads me to believe set uses the same underlying hashing the python dict type uses, since I ran into the same error when I did it the old way:

def unique(seq):
    return dict.fromkeys(seq).keys()

The performance is so similar to set() that there must be a lot of shared code on the C side (2.13 seconds versus 1.57 or so in a test I ran). I'm sure the new set() stuff is a bit simpler and the python side is definitely cleaner.

Interesting stuff--thanks.

◄	Python recipes (4591)	►
◄	bearophile -'s recipes (15)	►

Unique stable-unstable (Python recipe) by bearophile -
ActiveState Code (http://code.activestate.com/recipes/438599/)

1 comment

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Unique stable-unstable (Python recipe) by bearophile - ActiveState Code (http://code.activestate.com/recipes/438599/)

1 comment

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Unique stable-unstable (Python recipe) by bearophile -
ActiveState Code (http://code.activestate.com/recipes/438599/)