This module can be used to split any file, text or binary, into equal-sized chunks. It can also combine the chunks to recreate the original file.
""" FileSplitter - Simple Python file split/concat module.
What it does
-==========-
1. Split a text/binary file into equal sized chunks
and save them separately.
2. Concat existing chunks and recreate
original file.
Author: Anand Pillai
Copyright : None, (Public Domain)
"""
import os, sys
class FileSplitterException(Exception):
    """Error raised when a split or join operation cannot be carried out.

    The original cause (typically the text of an I/O error) is kept in
    ``value`` and is what ``str()`` of the exception returns.
    """

    def __init__(self, value):
        # Initialise the base class explicitly so ``args`` always mirrors
        # ``value``, whichever way the exception is constructed.
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        return str(self.value)
def usage():
    """Return the command-line help text for FileSplitter.py.

    The value given to ``-n`` is the number of chunks to produce (the
    size of each chunk is derived from it), so the help text labels it
    ``<numchunks>``.
    """
    return """\nUsage: FileSplitter.py -i <inputfile> -n <numchunks> [option]\n
    Options:\n
    -s, --split  Split file into chunks
    -j, --join   Join chunks back to file.
    """
class FileSplitter:
    """Split a file into numbered chunk files, or join such chunks back.

    Chunk files are named ``<basename>-<N><postfix>`` with ``N`` starting
    at 1, and are written to / read from the current working directory.
    Configure an instance with ``parseOptions`` and run it with
    ``do_work``.
    """

    # Largest number of bytes held in memory at once while copying, so
    # files bigger than available RAM can still be processed.
    _BUFSIZE = 1024 * 1024

    def __init__(self):
        # cache filename
        self.__filename = ''
        # number of equal sized chunks
        self.__numchunks = 5
        # Size of each chunk
        self.__chunksize = 0
        # Optional postfix string for the chunk filename
        self.__postfix = ''
        # Program name
        self.__progname = "FileSplitter.py"
        # Action: 0 = split, 1 = combine
        self.__action = 0

    def parseOptions(self, args):
        """Configure the splitter from a list of command-line arguments.

        Recognised options: ``-i <file>``, ``-n <numchunks>``,
        ``-s``/``--split`` and ``-j``/``--join``.

        Returns None. If the options cannot be parsed, the getopt error
        is printed and the instance is left unchanged. Exits the process
        (``sys.exit``) when no input file is given or when the chunk
        count is smaller than 1. A non-integer ``-n`` value raises
        ValueError.
        """
        import getopt

        try:
            # The long options are plain flags; a trailing '=' would make
            # getopt demand an argument for --split / --join.
            optlist, _ = getopt.getopt(args, 'sji:n:', ["split", "join"])
        except getopt.GetoptError as e:
            print(e)
            return None

        for option, value in optlist:
            option = option.lower()
            if option == '-i':
                self.__filename = value
            elif option == '-n':
                self.__numchunks = int(value)
                if self.__numchunks < 1:
                    # Zero chunks would divide by zero in split().
                    sys.exit("Error: number of chunks must be at least 1")
            elif option in ('-s', '--split'):
                self.__action = 0  # split
            elif option in ('-j', '--join'):
                self.__action = 1  # combine

        if not self.__filename:
            sys.exit("Error: filename not given")

    def do_work(self):
        """Run the configured action: split (0) or combine (1)."""
        if self.__action == 0:
            self.split()
        elif self.__action == 1:
            self.combine()
        else:
            return None

    def _copy_range(self, src, dst, count):
        """Copy up to ``count`` bytes from ``src`` to ``dst`` in buffered pieces."""
        remaining = count
        while remaining > 0:
            data = src.read(min(self._BUFSIZE, remaining))
            if not data:
                # Source ended early (e.g. it shrank after being sized).
                break
            dst.write(data)
            remaining -= len(data)

    def split(self):
        """Split the file and save chunks to separate files.

        Every chunk holds ``filesize // numchunks`` bytes except the last
        one, which also receives the remainder. Chunks are written to the
        current working directory.

        Raises FileSplitterException if the input file cannot be opened.
        A failure while writing one chunk is printed and the remaining
        chunks are still attempted.
        """
        print('Splitting file %s' % self.__filename)
        print('Number of chunks %s \n' % self.__numchunks)

        try:
            src = open(self.__filename, 'rb')
        except (OSError, IOError) as e:
            raise FileSplitterException(str(e))

        with src:
            bname = os.path.basename(self.__filename)
            fsize = os.path.getsize(self.__filename)
            # Integer division: exact for any file size, unlike going
            # through floats.
            self.__chunksize = fsize // self.__numchunks
            last = self.__numchunks - 1

            for x in range(self.__numchunks):
                chunkfilename = bname + '-' + str(x + 1) + self.__postfix
                start = x * self.__chunksize
                # The last chunk absorbs whatever the division left over.
                if x == last:
                    length = fsize - start
                else:
                    length = self.__chunksize
                try:
                    print('Writing file %s' % chunkfilename)
                    # Seek explicitly so a failed earlier chunk cannot
                    # shift the data of the following ones.
                    src.seek(start)
                    with open(chunkfilename, 'wb') as chunkf:
                        self._copy_range(src, chunkf, length)
                except (OSError, IOError) as e:
                    print(e)
                    continue

        print('Done.')

    def _chunk_index(self, name):
        """Return the chunk number after the last '-' of ``name``, or None.

        None means ``name`` contains no '-'. Raises ValueError when the
        text after the dash is not a number.
        """
        dash = name.rfind('-')
        if dash == -1:
            return None
        digits = name[dash + 1:]
        if self.__postfix and digits.endswith(self.__postfix):
            digits = digits[:-len(self.__postfix)]
        return int(digits)

    def sort_index(self, f1, f2):
        """Compare two chunk file names by their numeric chunk index.

        Returns a negative, zero or positive integer when ``f1`` sorts
        before, equal to or after ``f2`` (ascending index order). Names
        without a '-' separator compare as equal.
        """
        i1 = self._chunk_index(f1)
        i2 = self._chunk_index(f2)
        if i1 is None or i2 is None:
            return 0
        return (i1 > i2) - (i1 < i2)

    def combine(self):
        """Combine existing chunks to recreate the file.

        The chunks must be present in the cwd. The new file will be
        written to cwd. Chunks are appended in ascending numeric order
        and streamed straight to the output file.

        Raises FileSplitterException when no chunk files are found or
        when the output cannot be written. A chunk that cannot be opened
        is reported and skipped.
        """
        import re
        import shutil

        print('Creating file %s' % self.__filename)

        bname = os.path.basename(self.__filename)
        # re.escape covers every regex metacharacter in the file name;
        # \Z stops look-alikes such as "name-3.bak" from matching.
        chunkre = re.compile(
            re.escape(bname) + '-([0-9]+)' + re.escape(self.__postfix) + r'\Z'
        )

        indexed = []
        for entry in os.listdir("."):
            found = chunkre.match(entry)
            if found:
                indexed.append((int(found.group(1)), entry))
        # Numeric sort, so chunk 10 comes after chunk 9 rather than after 1.
        indexed.sort()
        chunkfiles = [name for _, name in indexed]

        print('Number of chunks %s \n' % len(chunkfiles))
        if not chunkfiles:
            # Do not create (or truncate) the target when there is
            # nothing to join.
            raise FileSplitterException(
                "no chunk files found for '%s'" % bname)

        try:
            with open(bname, 'wb') as out:
                for name in chunkfiles:
                    print('Appending chunk %s' % os.path.join(".", name))
                    try:
                        chunk = open(name, 'rb')
                    except (OSError, IOError) as e:
                        print(e)
                        continue
                    with chunk:
                        shutil.copyfileobj(chunk, out, self._BUFSIZE)
        except (OSError, IOError) as e:
            raise FileSplitterException(str(e))

        print('Wrote file %s' % bname)
def main():
    """Command-line entry point.

    Exits with the usage text when no arguments are given. Otherwise it
    configures a FileSplitter from ``sys.argv`` and runs it, turning a
    FileSplitterException into an error message and a non-zero exit
    status instead of a traceback.
    """
    if len(sys.argv) < 2:
        sys.exit(usage())

    fsp = FileSplitter()
    fsp.parseOptions(sys.argv[1:])
    try:
        fsp.do_work()
    except FileSplitterException as e:
        sys.exit("Error: %s" % e)
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
|
Often we need to split big files into many chunks, whether for saving them to disks, uploading them to a website, or for some other reason. I used to rely on third-party programs for this task, but could never find one handy when I needed it.
As usual, Python excels at such 'scripting' tasks, and this script makes the job a breeze. :-)
Refactored...
Files larger than the physical memory of the machine. I made a slight modification to the combine function. See how I append the "data" from each chunk to the file, instead of holding one large "data" object in memory.
File size larger than Physical Memory available. Here is the code:
Constant filename length. Here is the code to number each file with 3 digits (for split mode, near line #107 for me):
Output to the same directory. Here is the code to generate each file in the same directory as the source (near line #94 for me):