Natural string sorting « Python recipes

Sorts strings in a way that seems natural to humans. If the strings contain integers, then the integers are ordered numerically. For example, sorts ['Team 11', 'Team 3', 'Team 1'] into the order ['Team 1', 'Team 3', 'Team 11'].

      # ---------------------------------------------------------
# natsort.py: Natural string sorting.
# ---------------------------------------------------------

# By Seo Sanghyeon.  Some changes by Connelly Barnes.

def try_int(s):
    "Convert to integer if possible."
    try: return int(s)
    except: return s

def natsort_key(s):
    "Used internally to get a tuple by which s is sorted."
    import re
    return map(try_int, re.findall(r'(\d+|\D+)', s))

def natcmp(a, b):
    "Natural string comparison, case sensitive."
    return cmp(natsort_key(a), natsort_key(b))

def natcasecmp(a, b):
    "Natural string comparison, ignores case."
    return natcmp(a.lower(), b.lower())

def natsort(seq, cmp=natcmp):
    "In-place natural string sort."
    seq.sort(cmp)
    
def natsorted(seq, cmp=natcmp):
    "Returns a copy of seq, sorted by natural string sort."
    import copy
    temp = copy.copy(seq)
    natsort(temp, cmp)
    return temp

      

You can use this code to sort tarball filenames:

>>> natsorted(['ver-1.3.12', 'ver-1.3.3', 'ver-1.2.5', 'ver-1.2.15', 'ver-1.2.3', 'ver-1.2.1'])
['ver-1.2.1', 'ver-1.2.3', 'ver-1.2.5', 'ver-1.2.15', 'ver-1.3.3', 'ver-1.3.12']

Chemical elements:

>>> natsorted(['C1H2', 'C1H4', 'C2H2', 'C2H6', 'C2N', 'C3H6'])
['C1H2', 'C1H4', 'C2H2', 'C2H6', 'C2N', 'C3H6']

Teams:

>>> natsorted(['Team 101', 'Team 58', 'Team 30', 'Team 1'])
['Team 1', 'Team 30', 'Team 58', 'Team 101']

Pass natcasecmp as a second argument for case-insensitive sorting:

>>> natsorted(['a5', 'A7', 'a15', 'a9', 'A8'], natcasecmp)
['a5', 'A7', 'A8', 'a9', 'a15']

Tags: text

8 comments

David Klaffenbach 19 years, 7 months ago # | flag

How about a decorate, sort, undecorate technique? Here's an alternative approach that also makes use of re but may be faster by using the decorate, sort, undecorate idiom:

import re
digitsre=re.compile(r'\d+')         # finds groups of digits
D_LEN=3

def _decor(s):
    '''decorate function for sorting alphanumeric strings naturally'''
    return digitsre.sub(lambda s: str(len(s.group())).zfill(D_LEN)+s.group(),s)

def _rem_len(s):
    '''sub function for undecor - removes leading length digits'''
    return s.group()[D_LEN:]

def _undecor(s):
    '''undecorate function for sorting alpha strings naturally'''
    return digitsre.sub(_rem_len,s)

def natsort(list_):
    '''sort a list in natural order'''
    tmp=[_decor(s) for s in list_]
    tmp.sort()
    return [_undecor(s) for s in tmp]

def natsort24(list_):
    '''Python 2.4 version'''
    return [_undecor(s) for s in sorted([_decor(s) for s in list_])]

The re.sub method is used to prepend every numeric group with it's length, so 1 becomes 11, 25 becomes 325 and so on. This is done as a decorate step and the list can then be sorted with the standard sort() function (or sorted() in Python 2.4). Then the list is undecorated and left in sorted order.

John Mitchell 18 years, 10 months ago # | flag

short, modern alternative code. This uses newer features, like List Comprehensions [ http://www.network-theory.co.uk/docs/pytut/tut_38.html ] to make the code very small and fast.

import re

# dictionary: keys will be "natural" sorted: 1,2,3...9,10,11...
#
zoot = {'indy1:11':'beer', 'indy12:11':'coffee', 'indy2:11':'doughnut'}

# list each item, prefixed with a number
# based on its order
#
zoot    = [ ( int(re.search('\d+', key).group(0)), key, value )
            for key,value in zoot.items() ]
zoot.sort()
print zoot


# undecorate:
#
zoot    = [ item[1:] for item in zoot ]
print zoot

The output is the original data, but sorted in natural order:

[('indy1:11', 'beer'), ('indy2:11', 'doughnut'), ('indy12:11', 'coffee')]

r8qyfhp02 18 years, 2 months ago # | flag

simplified version of list comprehensions code. For sorting lists:

def natsort(list_):
    # decorate
    tmp = [ (int(re.search('\d+', i).group(0)), i) for i in list_ ]
    tmp.sort()
    # undecorate
    return [ i[1] for i in tmp ]

Romain Dartigues 15 years, 6 months ago # | flag

Simple version not relying on the re module

def keynat(string):
    r'''A natural sort helper function for sort() and sorted()
    without using regular expression.

    >>> items = ('Z', 'a', '10', '1', '9')
    >>> sorted(items)
    ['1', '10', '9', 'Z', 'a']
    >>> sorted(items, key=keynat)
    ['1', '9', '10', 'Z', 'a']
    '''
    r = []
    for c in string:
        try:
            c = int(c)
            try: r[-1] = r[-1] * 10 + c
            except: r.append(c)
        except:
            r.append(c)
    return r

Compability (tested with: python 1.5.1, 2.2.1, 2.3.4, 2.4.3, 2.5.2).

def keynat_cmp(a, b):
    r'''Compability for Python < 2.4.

    >>> items = ['Z', 'a', '10', '1', '9']
    >>> items.sort(), items
    (None, ['1', '10', '9', 'Z', 'a'])
    >>> items.sort(keynat_cmp), items
    (None, ['1', '9', '10', 'Z', 'a'])
    '''
    return cmp(keynat(a), keynat(b))

Performances tested with timeit shows no signifiant differences (slightly faster however) with the re versions on small and medium lists. Similar solutions on: http://www.davekoelle.com/alphanum.html

Jeremy Sproat 15 years, 3 months ago # | flag

Here's a version I've been using which fully exploits list comprehensions. In addition to converting strings of digits to ints, it also allows case-insensitive sorting.

import re,sys;
def keynat(l):
    '''normalizes a path for natural sorting'''
    return [
        [
            pp[0] if p[0] else int(pp[1]) 
            for pp 
            in re.findall(r'(\\D+)|(\\d+)',p)
            ] 
        for p 
        in l.strip().split('/').lower()
        ]
print sorted(lines, key=natkey)

For each line, it splits it along the '/' character, which guarantees that a directory name always appears before the contents of the directory (i.e. in case a filename starts with an ordinal lower than 0x20). Then, for each path part, it breaks it into a list of digits and non-digits:

In [2]: natkey('/home/sproaticus/Images/London 213.jpeg')
Out[2]: [[], ['home'], ['sproaticus'], ['images'], ['london ', 213, '.jpeg']]

The primary benefit I find with this method is that it avoids use of try/except statements, and so lends itself well to oneliners for command line usage:

find . | python -c "import re,sys;print ''.join(sorted(sys.stdin.readlines(), key=lambda l:[[pp[0] if pp[0] else int(pp[1]) for pp in re.findall(r'(\\D+)|(\\d+)',p)] for p in l.strip().lower().split('/')]))" > dir.txt

I haven't tested this method's performance, but I suspect it'd be a little bit slower than the other methods due to all the slicing and dicing.

(Tested in Python 2.5.2)

paul clinch 14 years, 2 months ago # | flag

A similar function, but caseless comparison and avoiding exceptions. It handles digits much more quickly ( approx. 2-3 times ) in cpython 2.5, 2.6.

def keynat(string):
    r'''A natural sort helper function for sort() and sorted()
    without using regular expressions or exceptions.

    >>> items = ('Z', 'a', '10th', '1st', '9')
    >>> sorted(items)
    ['10th', '1st', '9', 'Z', 'a']
    >>> sorted(items, key=keynat)
    ['1st', '9', '10th', 'a', 'Z']    
    '''
    it = type(1)
    r = []
    for c in string:
        if c.isdigit():
            d = int(c)
            if r and type( r[-1] ) == it: 
                r[-1] = r[-1] * 10 + d
            else: 
                r.append(d)
        else:
            r.append(c.lower())
    return r

Timo Reunanen 13 years, 10 months ago # | flag

Here is my version of natural sort.

import re
import unicodedata

def natural(key):
    """natural(key)
    usage:
    >>> sorted(unsorted, key=natural)
    >>> unsorted.sort(key=natural)

    if key is unicode, it simplifies key to ascii using unicodedata.normalize.
    """

    if isinstance(key, basestring):
        if isinstance(key, unicode):
            key=unicodedata.normalize('NFKD', key.lower()).encode('ascii', 'ignore')
        else:
            key=key.lower()

        return [int(n) if n else s for n,s in re.findall(r'(\d+)|(\D+)', key)]
    else:
        return key

Spondon 13 years, 2 months ago # | flag

Here is the original code equipped to do reverse sorting:

# ---------------------------------------------------------
# natsort.py: Natural string sorting.
# ---------------------------------------------------------

# By Seo Sanghyeon.  Some changes by Connelly Barnes & Spondon Saha

def try_int(s):
    "Convert to integer if possible."
    try: return int(s)
    except: return s

def natsort_key(s):
    "Used internally to get a tuple by which s is sorted."
    import re
    return map(try_int, re.findall(r'(\d+|\D+)', s))

def natcmp(a, b):
    "Natural string comparison, case sensitive."
    return cmp(natsort_key(a), natsort_key(b))

def natcasecmp(a, b):
    "Natural string comparison, ignores case."
    return natcmp(a.lower(), b.lower())

def natsort(seq, cmp=natcmp, reverse=False):
    "In-place natural string sort."
    if reverse:
        seq.sort(cmp, reverse=True)
    else:
        seq.sort(cmp)

def natsorted(seq, cmp=natcmp, reverse=False):
    "Returns a copy of seq, sorted by natural string sort."
    import copy
    temp = copy.copy(seq)
    natsort(temp, cmp, reverse)
    return temp

◄	Python recipes (4591)	►
◄	Connelly Barnes's recipes (7)	►

Natural string sorting (Python recipe) by Connelly Barnes
ActiveState Code (http://code.activestate.com/recipes/285264/)

8 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Natural string sorting (Python recipe) by Connelly Barnes ActiveState Code (http://code.activestate.com/recipes/285264/)

8 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Natural string sorting (Python recipe) by Connelly Barnes
ActiveState Code (http://code.activestate.com/recipes/285264/)