Welcome, guest | Sign In | My Account | Store | Cart

This module can be used to split any file, text or binary to equal sized chunks. It can also combine the chunks back to recreate the original file.

Python, 196 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
""" FileSplitter - Simple Python file split/concat module.

    What it does
    -==========-
    
    1. Split a text/binary file into equal sized chunks
       and save them separately. 

    2. Concat existing chunks and recreate
       original file.

    Author: Anand Pillai
    Copyright : None, (Public Domain)
"""

import os, sys

class FileSplitterException(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return str(self.value)

def usage():
    return """\nUsage: FileSplitter.py -i <inputfile> -n <chunksize> [option]\n
    Options:\n
    -s, --split  Split file into chunks
    -j, --join   Join chunks back to file.
    """

class FileSplitter:
    """ File splitter class """

    def __init__(self):

        # cache filename
        self.__filename = ''
        # number of equal sized chunks
        self.__numchunks = 5
        # Size of each chunk
        self.__chunksize = 0
        # Optional postfix string for the chunk filename
        self.__postfix = ''
        # Program name
        self.__progname = "FileSplitter.py"
        # Action
        self.__action = 0 # split

    def parseOptions(self, args):

        import getopt

        try:
            optlist, arglist = getopt.getopt(args, 'sji:n:', ["split=", "join="])
        except getopt.GetoptError, e:
            print e
            return None

        for option, value in optlist:
            if option.lower() in ('-i', ):
                self.__filename = value
            elif option.lower() in ('-n', ):
                self.__numchunks = int(value)
            elif option.lower() in ('-s', '--split'):
                self.__action = 0 # split
            elif option.lower() in ('-j', '--join'):
                self.__action = 1 # combine

        if not self.__filename:
            sys.exit("Error: filename not given")
        
    def do_work(self):
        if self.__action==0:
            self.split()
        elif self.__action==1:
            self.combine()
        else:
            return None
        
    def split(self):
        """ Split the file and save chunks
        to separate files """

        print 'Splitting file', self.__filename
        print 'Number of chunks', self.__numchunks, '\n'
        
        try:
            f = open(self.__filename, 'rb')
        except (OSError, IOError), e:
            raise FileSplitterException, str(e)

        bname = (os.path.split(self.__filename))[1]
        # Get the file size
        fsize = os.path.getsize(self.__filename)
        # Get size of each chunk
        self.__chunksize = int(float(fsize)/float(self.__numchunks))

        chunksz = self.__chunksize
        total_bytes = 0

        for x in range(self.__numchunks):
            chunkfilename = bname + '-' + str(x+1) + self.__postfix

            # if reading the last section, calculate correct
            # chunk size.
            if x == self.__numchunks - 1:
                chunksz = fsize - total_bytes

            try:
                print 'Writing file',chunkfilename
                data = f.read(chunksz)
                total_bytes += len(data)
                chunkf = file(chunkfilename, 'wb')
                chunkf.write(data)
                chunkf.close()
            except (OSError, IOError), e:
                print e
                continue
            except EOFError, e:
                print e
                break

        print 'Done.'

    def sort_index(self, f1, f2):

        index1 = f1.rfind('-')
        index2 = f2.rfind('-')
        
        if index1 != -1 and index2 != -1:
            i1 = int(f1[index1:len(f1)])
            i2 = int(f2[index2:len(f2)])
            return i2 - i1
        
    def combine(self):
        """ Combine existing chunks to recreate the file.
        The chunks must be present in the cwd. The new file
        will be written to cwd. """

        import re
        
        print 'Creating file', self.__filename
        
        bname = (os.path.split(self.__filename))[1]
        bname2 = bname
        
        # bugfix: if file contains characters like +,.,[]
        # properly escape them, otherwise re will fail to match.
        for a, b in zip(['+', '.', '[', ']','$', '(', ')'],
                        ['\+','\.','\[','\]','\$', '\(', '\)']):
            bname2 = bname2.replace(a, b)
            
        chunkre = re.compile(bname2 + '-' + '[0-9]+')
        
        chunkfiles = []
        for f in os.listdir("."):
            print f
            if chunkre.match(f):
                chunkfiles.append(f)


        print 'Number of chunks', len(chunkfiles), '\n'
        chunkfiles.sort(self.sort_index)

        data=''
        for f in chunkfiles:

            try:
                print 'Appending chunk', os.path.join(".", f)
                data += open(f, 'rb').read()
            except (OSError, IOError, EOFError), e:
                print e
                continue

        try:
            f = open(bname, 'wb')
            f.write(data)
            f.close()
        except (OSError, IOError, EOFError), e:
            raise FileSplitterException, str(e)

        print 'Wrote file', bname

def main():
    import sys

    if len(sys.argv)<2:
        sys.exit(usage())
        
    fsp = FileSplitter()
    fsp.parseOptions(sys.argv[1:])
    fsp.do_work()

if __name__=="__main__":
    main()

Often we need to split big files into many chunks either for saving them to disks, uploading to a web-site or for some other reason. I used to rely on 3rd party programs before for this task, but never could find a program handy when needed.

As usual python excels in such 'scripting' tasks and this script makes the job a breeze. :-)

5 comments

Anand (author) 20 years, 6 months ago  # | flag

Refactored...

# Modified on 18th Oct
I have modified this recipe to print a Usage string,
and it now takes command line options. While combining
it does not need the number of chunks and works with
all the chunks in the current directory.

-Anand
Anthony DiGregorio 20 years ago  # | flag

Files larger than physical memory on machine. I made a slight modification to the combine function. See how I appended "data" to the file from each chuck, instead of holding a large "data" object in memory.

Anthony DiGregorio 20 years ago  # | flag

File size larger than Physical Memory available. Here is the code:

try:
    cmbf = open(bname, 'ab')
    for f in chunkfiles:

        try:
            print 'Appending chunk', os.path.join(".", f)
            #data += open(f, 'rb').read()
            data = open(f, 'rb').read()
            cmbf.write(data)
        except (OSError, IOError, EOFError), e:
            print e
            continue
    cmbf.close()

except (OSError, IOError, EOFError), e:
    raise FileSplitterException, str(e)

#try:
#    f = open(bname, 'wb')
#    f.write(data)
#    f.close()
#except (OSError, IOError, EOFError), e:
#    raise FileSplitterException, str(e)

print 'Wrote file', bname
jc 19 years, 2 months ago  # | flag

constant filename length. Here is the code to get 3 digits for numeration of each file (for split mode, near of line #107 for me):

chunkfilename = bname + ('-%03d' % (x+1)) + self.__postfix
jc 19 years, 2 months ago  # | flag

Output to the same directory. Here is the code to generate each files in the same directory the the source (near line #94 for me):

#bname = (os.path.split(self.__filename))[1]
## output to the same directory
bname = self.__filename