import os

class FileSort(object):
    def __init__(self, inFile, outFile=None, splitSize=20):
        """splitSize is the chunk size in MB; files larger than this
        are split into chunks before sorting."""
        self._inFile = inFile
        if outFile is None:
            self._outFile = inFile
        else:
            self._outFile = outFile

        self._splitSize = splitSize * 1000000
        self.setKeyExtractMethod()

        
    def setKeyExtractMethod(self, keyExtractMethod=None):
        """Set the function that extracts the sort key from a line, e.g.:
            def f(line):
                return line[1:3], line[5:10]
        By default the whole line is used as the key."""
        if keyExtractMethod is None:
            self._getKey = lambda line: line
        else:
            self._getKey = keyExtractMethod

    def sort(self):
        files = self._splitFile()

        if files is None:
            # file size <= self._splitSize: sort it in one pass
            self._sortFile(self._inFile, self._outFile)
            return

        for fn in files:
            self._sortFile(fn)

        self._mergeFiles(files)
        self._deleteFiles(files)

        
    def _sortFile(self, fileName, outFile=None):
        # decorate-sort-undecorate: sort (key, line) pairs in memory
        lines = open(fileName).readlines()
        get_key = self._getKey
        data = [(get_key(line), line) for line in lines if line != '']
        data.sort()
        lines = [pair[1] for pair in data]
        if outFile is not None:
            open(outFile, 'w').write(''.join(lines))
        else:
            open(fileName, 'w').write(''.join(lines))

    def _splitFile(self):
        totalSize = os.path.getsize(self._inFile)
        if totalSize <= self._splitSize:
            # do not split; the file isn't big enough to be worth it
            return None

        fileNames = []

        fn, ext = os.path.splitext(self._inFile)
        f = open(self._inFile)
        try:
            i = size = 0
            lines = []
            for line in f:
                size += len(line)
                lines.append(line)
                if size >= self._splitSize:
                    # flush the current chunk to a numbered temp file
                    i += 1
                    tmpFile = fn + '.%03d' % i
                    fileNames.append(tmpFile)
                    open(tmpFile, 'w').write(''.join(lines))
                    del lines[:]
                    size = 0

            if size > 0:
                # write whatever is left over as the last chunk
                tmpFile = fn + '.%03d' % (i + 1)
                fileNames.append(tmpFile)
                open(tmpFile, 'w').write(''.join(lines))

            return fileNames
        finally:
            f.close()

    def _mergeFiles(self, files):
        # k-way merge: keep one pending line per chunk file and always
        # write out the line whose key is the current minimum
        files = [open(f) for f in files]
        lines = []
        keys = []

        for f in files:
            l = f.readline()
            lines.append(l)
            keys.append(self._getKey(l))

        buff = []
        # note: buffSize is derived from a byte count, but below it is
        # compared against len(buff), which counts lines
        buffSize = self._splitSize / 2
        append = buff.append
        output = open(self._outFile, 'w')
        try:
            key = min(keys)
            index = keys.index(key)
            get_key = self._getKey
            while 1:
                while key == min(keys):
                    append(lines[index])
                    if len(buff) > buffSize:
                        output.write(''.join(buff))
                        del buff[:]

                    line = files[index].readline()
                    if not line:
                        # this chunk is exhausted; drop it from the merge
                        files[index].close()
                        del files[index]
                        del keys[index]
                        del lines[index]
                        break
                    key = get_key(line)
                    keys[index] = key
                    lines[index] = line

                if len(files) == 0:
                    break
                # key != min(keys): find the file holding the new minimum
                key = min(keys)
                index = keys.index(key)

            if len(buff) > 0:
                output.write(''.join(buff))
        finally:
            output.close()

    def _deleteFiles(self, files):
        for fn in files:
            os.remove(fn)


def sort(inFileName, outFileName=None, getKeyMethod=None):
    fs = FileSort(inFileName, outFileName)
    if getKeyMethod is not None:
        fs.setKeyExtractMethod(getKeyMethod)
    fs.sort()
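A usage sketch (the file names and the key slice are made up for illustration):

    # sort a file in place, comparing whole lines
    sort('data.txt')

    # sort into a new file, keying on columns 5-10 of each line
    sort('data.txt', 'data.sorted.txt', lambda line: line[5:10])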

I didn't find this sorting algorithm implemented in Python anywhere. I'm not sure about the performance; maybe somebody has a better/faster solution.
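For comparison, here is a sketch of the same split/sort/merge idea built on the standard library's heapq.merge (available since Python 2.6). This is not the recipe's code; the function name and chunk size are arbitrary:

    import heapq
    import tempfile

    def external_sort(inFile, outFile, chunkBytes=20 * 1000 * 1000):
        # split phase: sort each ~chunkBytes run in memory, spill to a temp file
        runs = []
        with open(inFile) as f:
            while True:
                lines = f.readlines(chunkBytes)  # sizehint: roughly chunkBytes
                if not lines:
                    break
                lines.sort()
                run = tempfile.TemporaryFile(mode='w+')
                run.writelines(lines)
                run.seek(0)
                runs.append(run)
        # merge phase: heapq.merge streams the sorted runs line by line,
        # so memory use stays proportional to the number of runs
        with open(outFile, 'w') as out:
            out.writelines(heapq.merge(*runs))
        for run in runs:
            run.close()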

6 comments

Denis Barmenkov 18 years, 10 months ago

Wirth is GOD ;). Earlier I read about sorting sequences stored on tapes in "Algorithms + Data Structures = Programs" by Niklaus Wirth; what we have here is a speed-optimized one-pass version with higher disk and memory requirements.

Tomasz Bieruta (author) 18 years, 10 months ago

Oops ... to be clear, I wanted to say that I didn't find this "algorythmus" implemented in Python :-)

Josiah Carlson 18 years, 10 months ago

You know, there is a category listing on the left side of this page which shows you how to spell "algorithm".

Martin Schnell 18 years, 2 months ago

It works, but uses a lot of memory. I tested this algorithm with a 280 MB ASCII file.

The idea is compelling: don't sort a huge file in one step; break it into chunks, sort each chunk separately, and merge the sorted chunks.

The result is correct, but the merge step uses a huge amount of memory. Splitting and sorting don't use much memory, but merging does - around 700 MB. That is more than loading the whole file into memory and sorting it there.

As an alternative I used the following three-liner

lines = inFile.readlines()
lines.sort()
map(outFile.write, lines)

which used 'only' 400 MB of RAM to sort the 280 MB ASCII file.
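A side note on the three-liner: in Python 3, map() is lazy, so map(outFile.write, lines) would never actually write anything unless its result is consumed; outFile.writelines(lines) does the same job in either Python version:

    lines = inFile.readlines()
    lines.sort()
    outFile.writelines(lines)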

I don't understand from the code why the merge uses so much RAM. Otherwise the algorithm would be great.
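One likely answer, reading the listing above: buffSize = self._splitSize/2 is a byte figure (10,000,000 with the default 20 MB chunks), but it is compared against len(buff), which counts lines, so the merge buffer can grow to ten million lines before it is ever flushed. A minimal sketch of a byte-counted buffer (my own illustration; the class name and default limit are made up):

    class FlushingBuffer(object):
        """Accumulate lines and flush them to a file-like object once
        roughly maxBytes have been buffered."""
        def __init__(self, output, maxBytes=10 * 1000 * 1000):
            self._output = output
            self._maxBytes = maxBytes
            self._buff = []
            self._size = 0

        def append(self, line):
            self._buff.append(line)
            self._size += len(line)  # count bytes, not lines
            if self._size >= self._maxBytes:
                self.flush()

        def flush(self):
            self._output.write(''.join(self._buff))
            del self._buff[:]
            self._size = 0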

Tomasz Bieruta (author) 16 years, 11 months ago
Hi,
I found some improvements :-)
Just before saving each chunk file (in the _splitFile() method) I can also sort its lines:
    ...
            fileNames.append(tmpFile)
            data = self._sortRecords(lines)        # sort here
            open(tmpFile, 'w').write(''.join(data))
            del lines[:]
            data = None
            size = 0

        if size > 0:
            tmpFile = fn + '.%03d' % (i + 1)
            fileNames.append(tmpFile)
            data = self._sortRecords(lines)        # and here
            open(tmpFile, 'w').write(''.join(data))
            data = None
    ...

the _sortRecords() method now looks like this:

    def _sortRecords(self, records):
        get_key = self._getKey
        data = [(get_key(line), line) for line in records if line!='']
        data.sort()
        return [line[1] for line in data]

in the sort() method I also commented out these lines:

   ...
   #for fn in files:
   #    self._sortFile(fn)
   ...

And finally I changed the line

   ...
   buffSize = self._splitSize/(2*len(lines[0]))
   ...

in the _mergeFiles() method - I don't remember why :-(

In my tests the memory use stays at the same level, and with the chunk size I can trade memory against speed. As in your case, when I tried a 750 MB file I got memory problems.

regards
Tomasz Bieruta
Emily 12 years, 7 months ago

Thanks - this was exactly what I was after. Regarding memory use - I'm not a computer scientist, but I assume it depends heavily on the file you are sorting, specifically on how large it is.

For me, this sort worked on a 16 GB file using ~8 GB of RAM and taking ~13 CPU hours, whereas other sorts may be more memory-efficient for smaller files but not for larger ones.

Created by Tomasz Bieruta on Tue, 31 May 2005 (PSF)