Welcome, guest | Sign In | My Account | Store | Cart
"""splits a large text file into smaller ones, based on line count

Original is left unmodified.

Resulting text files are stored in the same directory as the original file.

Useful for breaking up text-based logs or blocks of login credentials.

"""

import os

def split_file(filepath, lines_per_file=100):
    """Split the text file at `filepath` into sub-files of `lines_per_file` lines.

    The original file is only read, never modified.  Each sub-file is written
    next to the original and is named ``<name>_<start><ext>``, where
    ``<start>`` is the zero-based index of the first line it contains
    (e.g. ``log_0.txt``, ``log_100.txt``, ...).  The last sub-file may hold
    fewer than `lines_per_file` lines.  An empty input still produces a single
    empty ``<name>_0<ext>`` file.

    Args:
        filepath: path of the text file to split.
        lines_per_file: maximum number of lines per sub-file; must be >= 1.

    Raises:
        ValueError: if `lines_per_file` is smaller than 1.
        OSError: if the input cannot be read or a sub-file cannot be written.
    """
    if lines_per_file < 1:
        # Fail early with a clear message instead of a ZeroDivisionError
        # from the modulo below.
        raise ValueError(
            'lines_per_file must be at least 1, got {!r}'.format(lines_per_file))

    path, filename = os.path.split(filepath)
    name, ext = os.path.splitext(filename)

    def chunk_path(start):
        # Output file for the chunk whose first line has index `start`.
        return os.path.join(path, '{}_{}{}'.format(name, start, ext))

    with open(filepath, 'r') as r:
        # `w` starts as None so the cleanup in `finally` never touches an
        # unbound name when opening the very first sub-file fails.
        w = None
        try:
            for i, line in enumerate(r):
                if i % lines_per_file == 0:
                    # Chunk boundary: finish the current sub-file (if any)
                    # and start the next one.
                    if w is not None:
                        w.close()
                    w = open(chunk_path(i), 'w')
                w.write(line)
            if w is None:
                # Empty input: keep the historical behaviour of leaving an
                # empty first sub-file behind.
                w = open(chunk_path(0), 'w')
        finally:
            if w is not None:
                w.close()

def test():
    """Demo of split_file(): build a 10000-line sample file and split it.

    Creates the directory /tmp/test_split_file/ if it does not exist yet,
    writes test.txt there with one fake "email<TAB>password" pair per line,
    then splits it into sub-files of 1000 lines each.
    """
    demo_dir = "/tmp/test_split_file/"
    if not os.path.exists(demo_dir):
        os.mkdir(demo_dir)
    sample = os.path.join(demo_dir, "test.txt")
    with open(sample, 'w') as out:
        # One credential-like record per line, numbered 1..10000.
        out.writelines(
            "email{}@myserver.net\tb4dpassw0rd{}\n".format(n, n)
            for n in range(1, 10001)
        )
    split_file(sample, 1000)

Diff to Previous Revision

--- revision 4 2012-02-17 19:17:25
+++ revision 5 2012-02-19 23:40:47
@@ -1,41 +1,42 @@
-"""splits a large text file into smaller ones, based on line count, original unmodified.
+"""splits a large text file into smaller ones, based on line count
+
+Original is left unmodified.
 
 Resulting text files are stored in the same directory as the original file.
 
-Useful for breaking up small text-based logs or blocks of login credentials.
+Useful for breaking up text-based logs or blocks of login credentials.
 
 """
 
 import os
 
-def split_file(file_loc, lines_per_file=100):
-    """splits file at `file_loc` into sub-files of length `lines_per_file`
-returns 1 if last file is smaller than the rest"""
+def split_file(filepath, lines_per_file=100):
+    """splits file at `filepath` into sub-files of length `lines_per_file`
+    """
     lpf = lines_per_file
-    baseloc, basefile = os.path.split(file_loc)
-    with open(file_loc, 'r') as r:
-        next_start = 0
-        name, ext = basefile.split('.')
-        for i in xrange(lpf, lines_in(r) + lpf, lpf):
-            filename = "{}_{}.{}".format(name, next_start, ext)
-            with open(os.path.join(baseloc, filename), 'w') as w:
-                for ix in xrange(next_start, i):
-                    content = r.readline()
-                    if not content: return 1
-                    w.write(content)
-            next_start = i
-        return 0
+    path, filename = os.path.split(filepath)
+    with open(filepath, 'r') as r:
+        name, ext = os.path.splitext(filename)
+        try:
+            w = open(os.path.join(path, '{}_{}{}'.format(name, 0, ext)), 'w')
+            for i, line in enumerate(r):
+                if not i % lpf:
+                    #possible enhancement: don't check modulo lpf on each pass
+                    #keep a counter variable, and reset on each checkpoint lpf.
+                    w.close()
+                    filename = os.path.join(path,
+                                            '{}_{}{}'.format(name, i, ext))
+                    w = open(filename, 'w')
+                w.write(line)
+        finally:
+            w.close()
 
-def lines_in(fileobj):
-    """returns total '\n' occurances, resets file cursor to byte 1"""
-    f = fileobj
-    char = f.read(1)
-    if not char: return -1
-    
-    nl = 1
-    while True:
-        if not char:
-            f.seek(1)
-            return nl
-        if char == '\n': nl += 1
-        char = f.read(1)
+def test():
+    """demonstrates the utility of split_file() function"""
+    testpath = "/tmp/test_split_file/"
+    if not os.path.exists(testpath): os.mkdir(testpath)
+    testfile = os.path.join(testpath, "test.txt")
+    with open(testfile, 'w') as w:
+        for i in range(1, 10001):
+            w.write("email{}@myserver.net\tb4dpassw0rd{}\n".format(i, i))
+    split_file(testfile, 1000)

History