Welcome, guest | Sign In | My Account | Store | Cart
#!/usr/bin/python

"""
NAME

    shuffle-merge -- shuffle-merge text files

SYNOPSIS
    %(progname)s [OPTIONS] <File Name Prefix>

DESCRIPTION
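    shuffle-merge merges the text files <File Name Prefix>_0,
    <File Name Prefix>_1, ... into one file named sm-<File Name Prefix>.
    Lines are copied a few at a time from randomly selected input files
    until every input file is exhausted.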
    
OPTIONS:
    Arguments:
    --help, -h
      Print a summary of the program options and exit.
    
    --nprocs=<int>, -n <int>
      number of input files to merge; they are expected to be named
      <File Name Prefix>_0 ... <File Name Prefix>_<int-1> [default=8]
    
    --maxlines=<int>, -m <int>
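      maximum number of consecutive lines copied from one input file
      per turn; the actual count is drawn at random between 1 and <int>
      [default=20]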
      
"""

#-- module metadata
# revision number; interpolated into __version__ below
__rev = "1.0"
__author__ = 'Alexandru Iosup'
__email__ = 'A.Iosup at ewi.tudelft.nl'
# NOTE(review): this rebinds the module's __file__ attribute to a fixed
# name, replacing whatever value it had when the module was loaded.
__file__ = 'shuffle-merge.py'
# NOTE(review): the '$Revision: ...$' / '$Date: ...$' layout looks like
# version-control keyword strings -- confirm before reformatting them.
__version__ = '$Revision: %s$' % __rev
__date__ = '$Date: 2005/08/15 16:59:00 $'
__copyright__ = 'Copyright (c) 2005 Alexandru IOSUP'
__license__ = 'Python'


import sys
import os
import getopt
import string 
import random
import time


def ShuffleMerge( InFilePrefix, NProcs, MaxLines ):
    """ 
    shuffle-merges files InFilePrefix_X, X in { 0, 1, ... NProcs-1 } and
    stores the result into sm-InFilePrefix.
    
    While at least one input file still has unread lines, one of the
    unfinished files is picked at random and between 1 and MaxLines
    consecutive lines are copied from it to the output file. The relative
    order of the lines coming from any single input file is preserved.
    
    Arguments:
        InFilePrefix -- common prefix of the input file names
        NProcs       -- number of input files (an int, or a string
                        holding one); 0 or less yields an empty output
        MaxLines     -- upper bound on the number of lines copied per
                        turn (an int, or a string holding one), >= 1
    
    Raises:
        ValueError -- NProcs or MaxLines is not an integer, or
                      MaxLines is smaller than 1
        IOError    -- the output file or an input file cannot be opened
    
    Notes: does NOT check if the input files are available.
           Re-seeds the global random generator from the current time.
           An I/O error while copying is reported on stdout and the
           input file being copied is then treated as finished.
           Progress is reported on stdout every 10000 lines and once
           more when the merge is complete.
    """
    
    NProcs = int(NProcs)
    MaxLines = int(MaxLines)
    if MaxLines < 1:
        # random.randint(1, MaxLines) would otherwise fail deep inside
        # the merge loop, after the output file has been created
        raise ValueError("MaxLines must be at least 1, got %d" % MaxLines)
    
    #-- init random seed
    random.seed(time.time())
    
    # same layout the progress report always had:
    #   nReadLines <n> [last read <k> from <id> / <unfinished ids> ]
    ProgressFormat = "nReadLines %s [last read %s from %s / %s ]"
    
    # bound up front so the final report also works when there is
    # nothing to merge (NProcs <= 0)
    nReadLines = 0
    nLinesToGet = 0
    ProcID = None
    
    InFiles = {}
    OutFile = open( "sm-%s" % InFilePrefix, "w" )
    try:
        for index in range(NProcs):
            InFiles[index] = open( "%s_%d" % (InFilePrefix, index), "r" )
            
        #-- IDs of all input files not finished yet
        ListOfNotFinished = list(range(NProcs))
        
        while ListOfNotFinished:
            
            #-- randomly select an input file
            ProcID = random.choice(ListOfNotFinished)
            
            #-- randomly copy 1 to MaxLines lines of it to the output file
            nLinesToGet = random.randint( 1, MaxLines )
            try:
                for _ in range(nLinesToGet):
                    line = InFiles[ProcID].readline()
                    if not line:
                        # end of file: retire it and stop reading from it
                        ListOfNotFinished.remove(ProcID)
                        break
                    OutFile.write( line )
                    nReadLines += 1
                    if nReadLines % 10000 == 0:
                        print(ProgressFormat % (nReadLines, nLinesToGet,
                                                ProcID, ListOfNotFinished))
            except IOError as e:
                print("I/O error(%s): %s" % (e.errno, e.strerror))
                if ProcID in ListOfNotFinished:
                    ListOfNotFinished.remove(ProcID)
                    
        print(ProgressFormat % (nReadLines, nLinesToGet,
                                ProcID, ListOfNotFinished))
    finally:
        # close everything that was opened, even if an open() or the
        # merge itself failed half-way
        OutFile.close()
        for InFile in InFiles.values():
            InFile.close()
        

def usage(progname):
    """
    prints the module documentation string, with the %(progname)s
    placeholder replaced by the given program name.
    """
    substitutions = {"progname": progname}
    print(__doc__ % substitutions)


def main(argv):
    """
    parses the command line and runs ShuffleMerge.
    
    Arguments:
        argv -- the command-line arguments, without the program name
    
    Recognized options are -h/--help, -n/--nprocs=<int> [default 8] and
    -m/--maxlines=<int> [default 20]. The first positional argument is
    the file name prefix [default "ttt"].
    
    Exits with status 2, after printing a message on stderr and the
    usage text, when an option is unknown or when a numeric option is
    not an integer >= 1. Exits with status 0 after printing the usage
    text when help is requested.
    """
    
    ProgName = os.path.basename(sys.argv[0])
    
    OneCharOpts = "hn:m:"
    MultiCharList = [
        "help", "nprocs=", "maxlines="
        ]
        
    try:
        opts, args = getopt.getopt( argv, OneCharOpts, MultiCharList )
    except getopt.GetoptError as err:
        # say what was wrong before showing the usage text
        sys.stderr.write("%s: %s\n" % (ProgName, err))
        usage(ProgName)
        sys.exit(2)
    
    NProcs = 8
    MaxLines = 20
    FileNamePrefix = "ttt"
    
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(ProgName)
            sys.exit()
        try:
            if opt in ("-n", "--nprocs"):
                NProcs = int(arg.strip())
            elif opt in ("-m", "--maxlines"):
                MaxLines = int(arg.strip())
        except ValueError:
            # reject bad numbers here instead of failing with a
            # traceback inside ShuffleMerge
            sys.stderr.write("%s: option %s expects an integer, got %r\n"
                             % (ProgName, opt, arg))
            usage(ProgName)
            sys.exit(2)
            
    if NProcs < 1 or MaxLines < 1:
        sys.stderr.write("%s: --nprocs and --maxlines must be at least 1\n"
                         % ProgName)
        usage(ProgName)
        sys.exit(2)
        
    if len(args) >= 1:
        FileNamePrefix = args[0]
        
    ShuffleMerge( FileNamePrefix, NProcs, MaxLines )

if __name__ == "__main__":

    #-- run as a script: hand main() everything after the program name
    CommandLineArgs = sys.argv[1:]
    main(CommandLineArgs)
    

History