This is just a rewrite of Recipe 466302 "Sorting big files the Python 2.4 way", taking advantage of heapq.merge, context managers, and other niceties of newer Python versions. It can be used to sort very large files (millions of records) in Python. No record termination character is required, hence a record may contain embedded binary data, newlines, etc. You can specify how many temporary files to use and where they are located.
| Python |
# based on Recipe 466302: Sorting big files the Python 2.4 way
# by Nicolas Lehuen
import heapq
import os
from collections import namedtuple
from itertools import cycle, islice
from tempfile import NamedTemporaryFile, gettempdir
# Pairs a record with the value it is ordered by during a keyed merge.
Keyed = namedtuple("Keyed", ["key", "obj"])


def _keyed(key, index, iterable):
    """Wrap every object of *iterable* in a ``Keyed`` record.

    The ``key`` field holds the pair ``(key(obj), index)``, where *index* is
    the position of *iterable* among the merged inputs.  Records coming from
    different inputs therefore never compare equal on ``key``, so the
    comparison never falls through to ``obj`` itself.

    This is a separate generator function (rather than a nested generator
    expression) so that *index* is bound per input instead of being looked
    up late, after the enclosing loop has finished.
    """
    for obj in iterable:
        yield Keyed((key(obj), index), obj)


def merge(key=None, *iterables):
    """Lazily merge several already-sorted iterables into one sorted stream.

    Args:
        key: One-argument function returning the value each object is
            ordered by, or ``None`` to order the objects by themselves.
            Every input must already be sorted according to the same key.
        *iterables: The sorted inputs to merge.

    Yields:
        The objects of all inputs in ascending order.  Objects with equal
        keys are yielded in input order (earlier iterables first), so the
        merge is stable.
    """
    # based on code posted by Scott David Daniels in c.l.p.
    # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d
    if key is None:
        # Undecorated elements have no ``.obj`` attribute: yield them as-is.
        for element in heapq.merge(*iterables):
            yield element
        return
    keyed_iterables = [_keyed(key, index, iterable)
                       for index, iterable in enumerate(iterables)]
    for element in heapq.merge(*keyed_iterables):
        yield element.obj
def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None):
    """Sort the lines of file *input* into file *output* using temporary files.

    The input is read in binary mode, *buffer_size* lines at a time.  Each
    batch is sorted in memory and written to its own temporary chunk file;
    the chunk files are then merged into *output*.  Every chunk file stays
    open until the merge has finished.

    Args:
        input: Path of the file to sort.
        output: Path of the file to write the sorted lines to.
        key: Optional one-argument function computing the sort key of a
            line.  It receives the raw line (bytes, including its line
            terminator).  ``None`` sorts by the whole line.
        buffer_size: Maximum number of lines held in memory per chunk.
        tempdirs: Optional list of directories for the chunk files; they
            are used in round-robin order.  Defaults to the system
            temporary directory.  The list is not modified.

    Raises:
        ValueError: If *buffer_size* is smaller than 1.
    """
    if buffer_size < 1:
        # islice(..., 0) would yield an empty first chunk and the function
        # would silently write an empty output file.
        raise ValueError('buffer_size must be at least 1')

    # Work on a private copy so the caller's list is never mutated.
    tempdirs = list(tempdirs) if tempdirs else [gettempdir()]

    chunks = []
    try:
        with open(input, 'rb', 64 * 1024) as input_file:
            input_iterator = iter(input_file)
            for tempdir in cycle(tempdirs):
                current_chunk = list(islice(input_iterator, buffer_size))
                if not current_chunk:
                    break
                # Only the very last line of the file can lack a terminator.
                # Without one it would be glued to whichever record follows
                # it once the lines are reordered.
                if not current_chunk[-1].endswith(b'\n'):
                    current_chunk[-1] += b'\n'
                current_chunk.sort(key=key)
                # A uniquely named temporary file avoids clashing with
                # concurrent runs or unrelated files in the directory; it is
                # removed automatically when closed.
                output_chunk = NamedTemporaryFile(
                    'w+b', 64 * 1024, prefix='batch_sort_', dir=tempdir)
                chunks.append(output_chunk)
                output_chunk.writelines(current_chunk)
                output_chunk.flush()
                output_chunk.seek(0)
        with open(output, 'wb', 64 * 1024) as output_file:
            output_file.writelines(merge(key, *chunks))
    finally:
        for chunk in chunks:
            try:
                chunk.close()
            except (IOError, OSError):
                # Best-effort cleanup: a failure to close or delete one chunk
                # must neither mask the original error nor stop the cleanup
                # of the remaining chunks.
                pass
if __name__ == '__main__':
    # Command line entry point: batch_sort [options] INPUT OUTPUT
    import optparse

    parser = optparse.OptionParser(usage='%prog [options] INPUT OUTPUT')
    parser.add_option(
        '-b', '--buffer',
        dest='buffer_size',
        type='int', default=32000,
        help='''Size of the line buffer. The file to sort is
divided into chunks of that many lines. Default : 32,000 lines.'''
    )
    parser.add_option(
        '-k', '--key',
        dest='key',
        help='''Python expression used to compute the key for each
line, "lambda line:" is prepended.\n
Example : -k "line[5:10]". By default, the whole line is the key.'''
    )
    parser.add_option(
        '-t', '--tempdir',
        dest='tempdirs',
        action='append',
        default=[],
        help='''Temporary directory to use. You might get performance
improvements if the temporary directory is not on the same physical
disk than the input and output directories. You can even try
providing multiples directories on differents physical disks.
Use multiple -t options to do that.'''
    )
    parser.add_option(
        '-p', '--psyco',
        dest='psyco',
        action='store_true',
        default=False,
        help='''Use Psyco.'''
    )
    options, args = parser.parse_args()

    # Report missing file names as a usage error instead of an IndexError
    # traceback.  Extra positional arguments are still ignored, as before.
    if len(args) < 2:
        parser.error('INPUT and OUTPUT file names are required')

    if options.key:
        # SECURITY: eval() executes arbitrary Python code taken from the
        # command line.  That is the documented purpose of -k, but this
        # option must never be fed a value from an untrusted source.
        options.key = eval('lambda line : (%s)' % options.key)

    if options.psyco:
        # Psyco is an optional third-party accelerator; fail with a clear
        # message rather than a traceback when it is not available.
        try:
            import psyco
        except ImportError:
            parser.error('Psyco was requested (-p) but is not installed')
        psyco.full()

    batch_sort(args[0], args[1], options.key, options.buffer_size,
               options.tempdirs)
|
Discussion
This is just a rewrite of Recipe 466302 "Sorting big files the Python 2.4 way", taking advantage of heapq.merge, context managers, and other niceties of newer Python versions.
Interface and command line usage are identical to the original recipe; most comments still apply too.


Comments
Like it - this could be pretty useful. Just one thing - shouldn't the default argument for tempdirs on line 25 be None and then be assigned [gettempdir()], since a list used as a default argument is shared between calls? (Not that it matters in this code, as the function is executed only once, but it could avoid some headaches if the function were used elsewhere.) Nice little script though!
Thanks - default argument fixed.
Sign in to comment