import os


class FileSort(object):

    def __init__(self, inFile, outFile=None, splitSize=20):
        """ split size (in MB) """
        self._inFile = inFile
        if outFile is None:
            self._outFile = inFile
        else:
            self._outFile = outFile
        self._splitSize = splitSize * 1000000
        self.setKeyExtractMethod()

    def setKeyExtractMethod(self, keyExtractMethod=None):
        """ key extract from line for sort method:
            def f(line):
                return line[1:3], line[5:10]
        """
        if keyExtractMethod is None:
            self._getKey = lambda line: line
        else:
            self._getKey = keyExtractMethod

    def sort(self):
        files = self._splitFile()
        if files is None:
            # file size <= self._splitSize
            self._sortFile(self._inFile, self._outFile)
            return
        for fn in files:
            self._sortFile(fn)
        self._mergeFiles(files)
        self._deleteFiles(files)

    def _sortFile(self, fileName, outFile=None):
        lines = open(fileName).readlines()
        get_key = self._getKey
        data = [(get_key(line), line) for line in lines if line != '']
        data.sort()
        lines = [line[1] for line in data]
        if outFile is not None:
            open(outFile, 'w').write(''.join(lines))
        else:
            open(fileName, 'w').write(''.join(lines))

    def _splitFile(self):
        totalSize = os.path.getsize(self._inFile)
        if totalSize <= self._splitSize:
            # do not split file, the file isn't so big.
            return None
        fileNames = []
        fn, e = os.path.splitext(self._inFile)
        f = open(self._inFile)
        try:
            i = size = 0
            lines = []
            for line in f:
                size += len(line)
                lines.append(line)
                if size >= self._splitSize:
                    i += 1
                    tmpFile = fn + '.%03d' % i
                    fileNames.append(tmpFile)
                    open(tmpFile, 'w').write(''.join(lines))
                    del lines[:]
                    size = 0
            if size > 0:
                tmpFile = fn + '.%03d' % (i + 1)
                fileNames.append(tmpFile)
                open(tmpFile, 'w').write(''.join(lines))
            return fileNames
        finally:
            f.close()

    def _mergeFiles(self, files):
        files = [open(f) for f in files]
        lines = []
        keys = []
        for f in files:
            l = f.readline()
            lines.append(l)
            keys.append(self._getKey(l))
        buff = []
        buffSize = self._splitSize / 2
        append = buff.append
        output = open(self._outFile, 'w')
        try:
            key = min(keys)
            index = keys.index(key)
            get_key = self._getKey
            while 1:
                while key == min(keys):
                    append(lines[index])
                    if len(buff) > buffSize:
                        output.write(''.join(buff))
                        del buff[:]
                    line = files[index].readline()
                    if not line:
                        files[index].close()
                        del files[index]
                        del keys[index]
                        del lines[index]
                        break
                    key = get_key(line)
                    keys[index] = key
                    lines[index] = line
                if len(files) == 0:
                    break
                # key != min(keys), see for new index (file)
                key = min(keys)
                index = keys.index(key)
            if len(buff) > 0:
                output.write(''.join(buff))
        finally:
            output.close()

    def _deleteFiles(self, files):
        for fn in files:
            os.remove(fn)


def sort(inFileName, outFileName=None, getKeyMethod=None):
    fs = FileSort(inFileName, outFileName)
    if getKeyMethod is not None:
        fs.setKeyExtractMethod(getKeyMethod)
    fs.sort()
    fs = None
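For example, to sort a big file on a couple of columns of each line, the module-level sort() helper can be called with a key-extraction function like the one in the setKeyExtractMethod docstring (the file names below are made up for illustration):

def get_key(line):
    # primary key: columns 1-3, secondary key: columns 5-10
    return line[1:3], line[5:10]

sort('big_input.txt', 'big_input_sorted.txt', get_key)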
I didn't find such a sorting algorythmus anywhere. I'm not sure about the performance; maybe somebody has a better/faster solution.
Tags: algorithms
Wirth is GOD ;). Earlier, I read about sorting sequences stored on tapes in "Algorithms + Data Structures = Programs" by Niklaus Wirth; here we have a speed-optimized one-step version with higher disk and memory requirements.
Oops ... to be clear, I wanted to say that I didn't find this algorythmus implemented in Python :-)
You know, there is a category listing on the left side of this page which shows you how to spell "algorithm".
Does work, but uses lots of memory. I have tested this algorithm with a 280 MB ASCII file.
The idea is compelling: for huge files do not sort them in one step, but break the file into chunks, sort each chunk separately and merge the files.
The result is correct, but the merge step uses a huge amount of memory. Splitting and sorting do not use a lot of memory, but merging does: around 700 MB. That is more than loading the whole file into memory and sorting it there.
As an alternative I have used the following three-liner, which used 'only' 400 MB of RAM to sort the same 280 MB ASCII file.
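(The snippet itself is not reproduced in this thread; presumably it was a plain in-memory sort, something along these lines, with made-up file names:)

lines = open('big_input.txt').readlines()
lines.sort()
open('big_input_sorted.txt', 'w').writelines(lines)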
I don't understand from the code why merge is using so much RAM. Otherwise the algorithm would be great.
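A likely reason is that buffSize is self._splitSize/2, i.e. 10,000,000, but it is compared against len(buff), which counts lines rather than bytes, so for typical line lengths most or all of the output accumulates in the buffer before it is ever flushed. For comparison, a merge step that streams the chunk files through heapq.merge holds only one line per chunk in memory at a time; a rough sketch, not part of the recipe (the key= argument of heapq.merge needs Python 3.5+, and the function name is made up):

import heapq

def merge_files_streaming(fileNames, outFileName, key=None):
    # Open every sorted chunk and let heapq.merge interleave the lines
    # lazily, so memory use stays bounded by one line per input file.
    chunks = [open(fn) for fn in fileNames]
    try:
        with open(outFileName, 'w') as out:
            for line in heapq.merge(*chunks, key=key):
                out.write(line)
    finally:
        for f in chunks:
            f.close()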
Thanks - this was exactly what I was after. Regarding the memory use: I'm not a computer scientist, but I assume it depends quite a lot on the file you are sorting, specifically on how large it is.
For me, this sort worked on a 16 GB file using ~8 GB of RAM and taking ~13 CPU hours, whereas other sorts may be more memory-efficient for smaller files but not for larger ones.