1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147 | import os
class FileSort(object):
def __init__(self, inFile, outFile=None, splitSize=20):
""" split size (in MB) """
self._inFile = inFile
if outFile is None:
self._outFile = inFile
else:
self._outFile = outFile
self._splitSize = splitSize * 1000000
self.setKeyExtractMethod()
def setKeyExtractMethod(self, keyExtractMethod=None):
""" key extract from line for sort method:
def f(line):
return line[1:3], line[5:10]
"""
if keyExtractMethod is None:
self._getKey = lambda line: line
else:
self._getKey = keyExtractMethod
def sort(self):
files = self._splitFile()
if files is None:
""" file size <= self._splitSize """
self._sortFile(self._inFile, self._outFile)
return
for fn in files:
self._sortFile(fn)
self._mergeFiles(files)
self._deleteFiles(files)
def _sortFile(self, fileName, outFile=None):
lines = open(fileName).readlines()
get_key = self._getKey
data = [(get_key(line), line) for line in lines if line!='']
data.sort()
lines = [line[1] for line in data]
if outFile is not None:
open(outFile, 'w').write(''.join(lines))
else:
open(fileName, 'w').write(''.join(lines))
def _splitFile(self):
totalSize = os.path.getsize(self._inFile)
if totalSize <= self._splitSize:
# do not split file, the file isn't so big.
return None
fileNames = []
fn,e = os.path.splitext(self._inFile)
f = open(self._inFile)
try:
i = size = 0
lines = []
for line in f:
size += len(line)
lines.append(line)
if size >= self._splitSize:
i += 1
tmpFile = fn + '.%03d' % i
fileNames.append(tmpFile)
open(tmpFile,'w').write(''.join(lines))
del lines[:]
size = 0
if size > 0:
tmpFile = fn + '.%03d' % (i+1)
fileNames.append(tmpFile)
open(tmpFile,'w').write(''.join(lines))
return fileNames
finally:
f.close()
def _mergeFiles(self, files):
files = [open(f) for f in files]
lines = []
keys = []
for f in files:
l = f.readline()
lines.append(l)
keys.append(self._getKey(l))
buff = []
buffSize = self._splitSize/2
append = buff.append
output = open(self._outFile,'w')
try:
key = min(keys)
index = keys.index(key)
get_key = self._getKey
while 1:
while key == min(keys):
append(lines[index])
if len(buff) > buffSize:
output.write(''.join(buff))
del buff[:]
line = files[index].readline()
if not line:
files[index].close()
del files[index]
del keys[index]
del lines[index]
break
key = get_key(line)
keys[index] = key
lines[index] = line
if len(files)==0:
break
# key != min(keys), see for new index (file)
key = min(keys)
index = keys.index(key)
if len(buff)>0:
output.write(''.join(buff))
finally:
output.close()
def _deleteFiles(self, files):
for fn in files:
os.remove(fn)
def sort(inFileName, outFileName=None, getKeyMethod=None):
fs = FileSort(inFileName, outFileName)
if getKeyMethod is not None:
fs.setKeyExtractMethod(getKeyMethod)
fs.sort()
fs = None
|
Comments
Wirth is GOD ;). Early, I read about sorting sequences placed on tapes, on "Algorithms + Data Structures = Programs" by Niklaus Wirth; here we have speed -optimized one-step version with more disk and memory requirements.
ups ... to be clear, I wanted to say, that I didn't find this algorythmus implemented in python :-)
You know, there is a category listing on the left side of this page which shows you how to spell "algorithm".
Does work, but uses lots of memory. I have tested this algorithm with a 280 MB ASCII file.
The idea is compelling: for huge files do not sort them in one step, but break the file into chunks, sort each chunk separately and merge the files.
The result is correct, but the merge step uses a hugh amount of memory. Split and sort does not use a lot of memory, but merge does, around 700 MB. This is more than loading the whole file into memory and sort then.
As an alternative I have used the following 3 liner
which has used 'only' 400MB RAM to sort a 280 MB ASCII file.
I don't understand from the code why merge is using so much RAM. Otherwise the algorithm would be great.
Sign in to comment