from heapq import heapify, heappop, heappush from itertools import islice, cycle from tempfile import gettempdir import os def merge(chunks,key=None): if key is None: key = lambda x : x values = [] for index, chunk in enumerate(chunks): try: iterator = iter(chunk) value = iterator.next() except StopIteration: try: chunk.close() os.remove(chunk.name) chunks.remove(chunk) except: pass else: heappush(values,((key(value),index,value,iterator,chunk))) while values: k, index, value, iterator, chunk = heappop(values) yield value try: value = iterator.next() except StopIteration: try: chunk.close() os.remove(chunk.name) chunks.remove(chunk) except: pass else: heappush(values,(key(value),index,value,iterator,chunk)) def batch_sort(input,output,key=None,buffer_size=32000,tempdirs=[]): if not tempdirs: tempdirs.append(gettempdir()) input_file = file(input,'rb',64*1024) try: input_iterator = iter(input_file) chunks = [] try: for tempdir in cycle(tempdirs): current_chunk = list(islice(input_iterator,buffer_size)) if current_chunk: current_chunk.sort(key=key) output_chunk = file(os.path.join(tempdir,'%06i'%len(chunks)),'w+b',64*1024) output_chunk.writelines(current_chunk) output_chunk.flush() output_chunk.seek(0) chunks.append(output_chunk) else: break except: for chunk in chunks: try: chunk.close() os.remove(chunk.name) except: pass if output_chunk not in chunks: try: output_chunk.close() os.remove(output_chunk.name) except: pass return finally: input_file.close() output_file = file(output,'wb',64*1024) try: output_file.writelines(merge(chunks,key)) finally: for chunk in chunks: try: chunk.close() os.remove(chunk.name) except: pass output_file.close() if __name__ == '__main__': import optparse parser = optparse.OptionParser() parser.add_option( '-b','--buffer', dest='buffer_size', type='int',default=32000, help='''Size of the line buffer. The file to sort is divided into chunks of that many lines. Default : 32,000 lines.''' ) parser.add_option( '-k','--key', dest='key', help='''Python expression used to compute the key for each line, "lambda line:" is prepended.\n Example : -k "line[5:10]". By default, the whole line is the key.''' ) parser.add_option( '-t','--tempdir', dest='tempdirs', action='append', default=[], help='''Temporary directory to use. You might get performance improvements if the temporary directory is not on the same physical disk than the input and output directories. You can even try providing multiples directories on differents physical disks. Use multiple -t options to do that.''' ) parser.add_option( '-p','--psyco', dest='psyco', action='store_true', default=False, help='''Use Psyco.''' ) options,args = parser.parse_args() if options.key: options.key = eval('lambda line : (%s)'%options.key) if options.psyco: import psyco psyco.full() batch_sort(args[0],args[1],options.key,options.buffer_size,options.tempdirs)