Welcome, guest | Sign In | My Account | Store | Cart

On several occasions I wanted to peruse an FTP site for a specific rpm within a certain age range and with a particular pattern to its filename and none of the tools available gave me that functionality. This recipe gives a find-like tool to the world of FTP. Great for cron jobs that download new RPMs that fit some tricky condition (e.g. less than 1 meg, less than a week old, ends in x86_64.tar.gz etc).

Python, 225 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#!/usr/bin/python
import fnmatch, ftplib, optparse, os, stat, string, sys, time

class FtpWalker:
    """Small convenience wrapper around an ftplib.FTP session.

    Offers the handful of operations the site walk needs: change directory,
    report the current directory, list it, and download a listed file.
    """

    def __init__( self, site, user, passwd ):
        # ftplib connects and logs in right away and raises on failure.
        self.ftp = ftplib.FTP( site, user, passwd )

    def cd( self, path ):
        """Change the remote directory; return True on success, False on failure."""
        try:
            self.ftp.cwd( path )
        except ftplib.all_errors:
            # Only FTP/network failures mean "cannot enter"; anything else
            # (e.g. KeyboardInterrupt) should still propagate.
            return False
        return True

    def pwd( self ):
        """Return the current remote directory."""
        return self.ftp.pwd()

    def get( self, fileinfo, binary=True, callback=None ):
        """Download the file described by fileinfo.

        Nothing is transferred when the local copy is already complete.  When
        no callback is given the data is written to fileinfo.longname; an
        interrupted binary download is resumed from the local file's size.
        Afterwards the local file (if this method created it) receives the
        remote modification time and mode, even when the transfer failed, so
        that a later run can recognise it as a partial download.
        """
        status = alreadydownloaded( fileinfo )
        if status == DOWNLOAD_NONE:
            return
        # Only binary transfers can be restarted part-way (REST offset).
        resume = binary and status == DOWNLOAD_PARTIAL
        filename = fileinfo.longname
        localfile = None    # stays None when the caller supplies a callback
        if not callback:
            localfile = createfile( fileinfo, binary, resume )
            if binary:
                callback = localfile.write
            else:
                # retrlines() hands over each line with its CRLF stripped,
                # so the line terminator has to be put back.
                def callback( text ):
                    localfile.write( text + "\n" )
        try:
            getstr = "RETR %s" % fileinfo.name
            if not binary:
                self.ftp.retrlines( getstr, callback )
            elif resume:
                self.ftp.retrbinary( getstr, callback, rest=os.path.getsize( filename ) )
            else:
                self.ftp.retrbinary( getstr, callback )
        finally:
            if localfile is not None:
                localfile.close()
                os.utime( filename, (fileinfo.date, fileinfo.date) )
                os.chmod( filename, fileinfo.mode )

    def ls( self, cwd ):
        """Return a list of FileInfo objects for the current remote directory.

        cwd is the path recorded in each FileInfo.  Blank lines and the
        "total N" summary line some servers emit are skipped because they
        are not directory entries.
        """
        lines = []
        self.ftp.retrlines( "LIST", lines.append )
        return [ extract_info( cwd, entry ) for entry in lines
                 if entry.strip() and not entry.startswith( "total " ) ]

# Verdicts returned by alreadydownloaded(): how much of a file must be fetched.
DOWNLOAD_FULL = 0      # fetch the whole file
DOWNLOAD_PARTIAL = 1   # local copy is incomplete; fetch the remainder
DOWNLOAD_NONE = 2      # local copy is complete; fetch nothing

def alreadydownloaded( fileinfo ):
    """Decide how much of fileinfo still has to be downloaded.

    Compares the local file at fileinfo.longname with the remote date and
    size.  A local file carrying a different date is treated as an older
    version: it is renamed out of the way and a full download is requested.
    """
    local = fileinfo.longname
    if not os.path.isfile( local ):
        return DOWNLOAD_FULL                # no file, download

    local_date, remote_date = os.path.getmtime( local ), fileinfo.date
    local_size, remote_size = os.path.getsize( local ), fileinfo.size

    if round( local_date ) != round( remote_date ):
        # old version: keep it under a versioned hidden name
        backup = mknewversion( fileinfo.path, fileinfo.name )
        os.rename( fileinfo.longname, backup )
        return DOWNLOAD_FULL

    if local_size == remote_size:
        return DOWNLOAD_NONE                # already downloaded
    return DOWNLOAD_PARTIAL                 # partially downloaded

def mknewversion( path, filename ):
    """Return the first unused backup name of the form <path>/.<filename>.NNN.

    NNN is a zero-padded counter starting at 001; the name is only computed,
    nothing is created on disk.
    """
    counter = 1
    while True:
        candidate = os.path.join( path, ".%s.%03d" % (filename, counter) )
        if not os.path.exists( candidate ):
            return candidate
        counter += 1
 
def iff( test_, then_, else_ ):
    """Functional if: return then_ when test_ is true, otherwise else_.

    Both alternatives are ordinary arguments, so they are always evaluated
    by the caller; pass plain values only.
    """
    if not test_:
        return else_
    return then_

def createfile( fileinfo, binary, append ):
    """Open the local file for a download and return the file object.

    fileinfo -- supplies .path (local directory) and .longname (local file)
    binary   -- open in binary mode when true, text mode otherwise
    append   -- when true and the file already exists, open it for appending
                so that a resumed transfer continues after the existing
                bytes; otherwise the file is created or truncated

    Missing local directories are created first.  An existing file that is
    to be appended to is made user-writable if it is not already.
    """
    fname = fileinfo.longname
    # An empty path means "current directory"; os.makedirs('') would raise.
    if fileinfo.path and not os.path.isdir( fileinfo.path ):
        os.makedirs( fileinfo.path )
    if append and os.path.isfile( fname ):
        # Exactly one of 'w'/'a' may be used: the former 'wba' truncated the
        # partial file (or was rejected outright), corrupting resumed downloads.
        mode = 'a'
        perm = os.stat( fname )[stat.ST_MODE]
        if not perm & stat.S_IWUSR:
            os.chmod( fname, perm | stat.S_IWUSR )
    else:
        mode = 'w'
    if binary:
        mode += 'b'
    return open( fname, mode )

# strptime/strftime layouts for the date column of a directory listing.
curr_year_fmt = '%b %d %H:%M'       # entries shown with a time of day
prev_year_fmt = '%b %d  %Y'         # entries shown with a year
unified_fmt = '%Y-%m-%d-%H:%M'

def updatetuple( t, i, x ):
    """Return a copy of sequence t, as a tuple, with the ith field replaced by x."""
    fields = tuple( t )
    return fields[:i] + (x,) + fields[i+1:]

def parsePrevYear( date ):
    """Parse a listing date that carries an explicit year (prev_year_fmt)."""
    return time.strptime( date, prev_year_fmt )
def parseCurrYear( date ):
    """Parse a listing date that has a time of day but no year.

    date is text laid out as curr_year_fmt (e.g. 'Jul  6 02:04').  The result
    is a 9-field time tuple suitable for time.mktime().

    Listings drop the year only for recent files, so the date is resolved to
    its most recent occurrence that does not lie in the future.  This keeps a
    December entry seen in January in the previous year instead of eleven
    months ahead.  The year is parsed together with the rest of the text so
    that Feb 29 is validated against a real year.

    Raises ValueError when the text does not match curr_year_fmt.
    """
    layout = "%Y " + curr_year_fmt
    this_year = time.gmtime()[0]
    # One day of slack absorbs time-zone differences between server and client.
    horizon = time.time() + 24 * 60 * 60
    # Eight years back always contains a leap year, so Feb 29 can be resolved.
    for year in range( this_year, this_year - 9, -1 ):
        try:
            parsed = time.strptime( "%d %s" % (year, date), layout )
        except ValueError:
            continue        # malformed text, or Feb 29 in a non-leap year
        if time.mktime( parsed ) <= horizon:
            return tuple( parsed )
    raise ValueError( "time data %r does not match format %r" % (date, curr_year_fmt) )

def dateParser( date ):
    """Choose the parser for a listing date: a ':' means a time of day is present."""
    if ':' in date:
        return parseCurrYear
    return parsePrevYear
def parseDate( date ):
    """Convert a listing date string into seconds, via time.mktime()."""
    parse = dateParser( date )
    return time.mktime( parse( date ) )

def displayDate( date ):
    """Format a timestamp the way a directory listing would show it.

    Dates in the current (GMT) year are rendered with a time of day, all
    others with the year.
    """
    when = time.gmtime( date )
    today = time.gmtime()
    if when[0] == today[0]:
        layout = curr_year_fmt
    else:
        layout = prev_year_fmt
    return time.strftime( layout, when )

# Bit values of one rwx permission triple; Z_MSK stands for "bit not set".
R_MSK = 4
W_MSK = 2
X_MSK = 1
Z_MSK = 0

# Characters used for the same permissions in a listing; Z_STR marks "absent".
R_STR = 'r'
W_STR = 'w'
X_STR = 'x'
Z_STR = '-'

def str2mode( str ):
    """Convert one three-character permission triple (e.g. 'rw-') to its bits.

    Returns the OR of R_MSK, W_MSK and X_MSK for the permissions present.
    In the execute position a lowercase 's' or 't' also counts as executable:
    listings use them when setuid/setgid or sticky is set together with the
    execute bit (the uppercase forms mean the execute bit is clear).
    """
    mode = Z_MSK
    if str[0] == R_STR:
        mode |= R_MSK
    if str[1] == W_STR:
        mode |= W_MSK
    if str[2] in (X_STR, 's', 't'):
        mode |= X_MSK
    return mode

def mode2str( mode ):
    """Render the three low permission bits of mode as text such as 'r-x'."""
    letters = []
    for mask, letter in ( (R_MSK, R_STR), (W_MSK, W_STR), (X_MSK, X_STR) ):
        if mode & mask:
            letters.append( letter )
        else:
            letters.append( Z_STR )
    return ''.join( letters )

def str2fullmode( str ):
    """Convert nine permission characters ('rwxr-xr-x') to a numeric mode."""
    user = str2mode( str[0:3] )
    group = str2mode( str[3:6] )
    other = str2mode( str[6:9] )
    return (user << 6) | (group << 3) | other

def fullmode2str( mode ):
    """Render a numeric mode as nine permission characters (user, group, other)."""
    triples = [ mode2str( (mode >> shift) & 0x7 ) for shift in (6, 3, 0) ]
    return ''.join( triples )

def str2perm( str ):
    """Split a listing mode field ('drwxr-xr-x') into (isdir, islink, mode)."""
    kind, bits = str[0], str[1:]
    return kind == 'd', kind == 'l', str2fullmode( bits )

def perm2str( isdir, islink, mode ):
    """Build a listing mode field from (isdir, islink, mode); isdir wins over islink."""
    if isdir:
        kind = 'd'
    elif islink:
        kind = 'l'
    else:
        kind = '-'
    return kind + fullmode2str( mode )

def extract_info( cwd, line ):
    """Turn one line of a Unix-style directory listing into a FileInfo.

    cwd  -- path recorded in the FileInfo as the entry's directory
    line -- the raw listing line: mode, links, owner, group, size, a
            12-character date and the name

    Raises ValueError when the line does not have that shape.
    """
    fullmode, links, owner, group, size, rest = line.split( None, 5 )
    isdir, islink, mode = str2perm( fullmode )
    dateStr, name = rest[:12], rest[13:]
    if islink:
        # Listings show links as "name -> target"; only the name is usable.
        name = name.split( ' -> ', 1 )[0]
    date = parseDate( dateStr )
    # The raw line is FileInfo's last constructor argument; omitting it made
    # every call fail with a TypeError.
    return FileInfo( cwd, name, fullmode, isdir, islink, mode, int( links ),
                     owner, group, int( size ), dateStr, date, line )

class FileInfo:
    """Record describing one entry of a remote directory listing.

    Besides the constructor arguments it stores longname (path joined with
    name) and age (the module-level 'now' minus date).
    """
    def __init__( self, path, name, modeStr, isdir, islink, mode, links, owner, group, size, dateStr, date, line ):
        self.path = path
        self.name = name
        self.isdir = isdir
        self.islink = islink
        self.modeStr = modeStr
        self.mode = mode
        self.owner = owner
        self.group = group
        self.links = links
        self.size = size
        self.dateStr = dateStr
        self.date = date
        self.longname = os.path.join( path, name )
        self.age = now - self.date
        self.line = line

def dropslashes( str ):
    """Return str without its leading '/' characters."""
    return str.lstrip( '/' )

def excluded( exclude_patterns, dir ):
    """Return True when dir matches at least one of the exclude patterns."""
    return any( pattern( candidate, dir ) for candidate in exclude_patterns )

def listSiteGen( walker, dir, opts ):
    """Yield a FileInfo for every non-directory entry below dir, depth first.

    walker -- provides pwd(), cd() and ls()
    dir    -- directory to descend into, relative to the walker's current one
    opts   -- options object; opts.exclude lists directory patterns to skip

    Directories that are excluded or cannot be entered yield nothing.  The
    walker is returned to its starting directory once the walk of dir is
    exhausted.
    """
    path = walker.pwd()
    if not excluded( opts.exclude, dir ) and walker.cd( dir ):
        for info in walker.ls( dropslashes( os.path.join( path, dir ) ) ):
            if not info.isdir:
                yield info
            elif info.name in ('.', '..'):
                # Some servers list the directory itself and its parent;
                # descending into either would never terminate.
                continue
            else:
                for rec_info in listSiteGen( walker, info.name, opts ):
                    yield rec_info
    walker.cd( path )

def ftpfind( walker, dir, opts ):
    """Print every file under dir accepted by opts.expr and, unless opts.test
    is set, download it."""
    for candidate in listSiteGen( walker, dir, opts ):
        if not opts.expr( candidate ):
            continue
        print( "%s" % opts.printer( candidate ) )
        if opts.test:
            continue
        walker.get( candidate )

def date( d, f=None ):
    """Convert date text d to seconds.

    With a strptime format f the text is parsed with it; without one, d is
    taken to be in directory-listing layout and handed to parseDate().
    """
    if not f:
        return parseDate( d )
    return time.mktime( time.strptime( d, f ) )

def pattern( p, v ):
    """Return True when value v matches the shell-style glob pattern p."""
    return fnmatch.fnmatch( v, p )
# Size units, in bytes, for use in filter expressions.
kilobyte = 1024
megabyte = kilobyte * kilobyte
gigabyte = kilobyte * megabyte
terabyte = kilobyte * gigabyte

# Durations, in seconds, for use in filter expressions.
second = 1
minute = 60*second
hour = 60*minute
day = 24*hour
week = 7*day
year = 52*week      # i.e. 364 days
 
def expr_cb( option, opt_str, value, parser ):
    """optparse callback: store 'lambda file: <value>' as the file filter."""
    # NOTE: eval() executes whatever Python text was given on the command line.
    source = "lambda file: " + value
    parser.values.expr = eval( source )
def print_cb( option, opt_str, value, parser ):
    """optparse callback: store 'lambda file: <value>' as the file printer."""
    # NOTE: eval() executes whatever Python text was given on the command line.
    source = "lambda file: " + value
    parser.values.printer = eval( source )

# Reference instant for the age filter: the current GMT broken-down time run
# through mktime().  daystart_cb() may replace it.
now = time.mktime(time.gmtime())

def daystart_cb( option, opt_str, value, parser ):
    """optparse callback: rebase the global 'now' to 00:00:00 of the current GMT day."""
    global now
    utc = time.gmtime()
    # keep year/month/day and the trailing wday/yday/isdst, zero the clock
    midnight = tuple( utc[:3] ) + (0, 0, 0) + tuple( utc[6:9] )
    now = time.mktime( midnight )

def def_printer( file ):
    """Default printer: the raw listing line of the file."""
    return file.line

def def_expr( file ):
    """Default filter: accept every file."""
    return True

def parse_command_line():
    """Declare the command-line options and parse sys.argv.

    Returns optparse's (options, positional arguments) pair.
    """
    cli = optparse.OptionParser()
    cli.set_defaults(
        user="anonymous",
        password="ftpfind@sf.net",
        expr=def_expr,
        test=False,
        exclude=[],
        printer=def_printer
    )
    # (flags, settings) in the order the options appear in --help
    option_table = [
        ( ("-e", "--expr"),
          dict( action="callback", callback=expr_cb, type="string",
                help="use the python expression, lambda file: <EXPR>, as a filter (must return boolean)",
                metavar="EXPR" ) ),
        ( ("-p", "--password"),
          dict( help="specify the password to use", metavar="PASSWD" ) ),
        ( ("--print",),
          dict( action="callback", callback=print_cb, type="string",
                help="use the printer, lambda file: <EXPR>, to print file summary (must return string)",
                metavar="EXPR" ) ),
        ( ("-s", "--daystart"),
          dict( action="callback", callback=daystart_cb,
                help="calculate ages from today @ 00:00" ) ),
        ( ("-t", "--test"),
          dict( action="store_true",
                help="print filename but do not perform file transfer" ) ),
        ( ("-u", "--user"),
          dict( help="specify the username to use", metavar="USER" ) ),
        ( ("-x", "--exclude"),
          dict( action="append", help="do not traverse this directory", metavar="DIR" ) ),
    ]
    for flags, settings in option_table:
        cli.add_option( *flags, **settings )
    return cli.parse_args()

if __name__ == '__main__':
    # Usage: ftpfind.py [option ...] server [dir ...]
    opts, args = parse_command_line()
    if not args:
        # Without this check a missing server argument ended in an IndexError.
        sys.stderr.write( "usage: ftpfind.py [option ...] server [dir ...]\n" )
        sys.exit(2)
    site, dirs = args[0], args[1:]
    if not dirs:
        dirs = ['/']        # default: walk the whole site
    try:
        walker = FtpWalker( site, opts.user, opts.password )
    except ftplib.all_errors:
        # Only connection/login failures are reported here; programming
        # errors are no longer disguised as authentication problems.
        print( "Couldn't authenticate '%s' with password '%s' on %s" % (opts.user, opts.password, site) )
        sys.exit(3)
    try:
        for top in dirs:
            ftpfind( walker, top, opts )
    finally:
        # Release the control connection however the walk ends.
        walker.ftp.close()

There are already a few FTP recipes in the cookbook, why another one? Well http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/302592 is a little on the complicated side and you have to write an XML config file to use it - not my cup of tea; http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/275594 and http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/327141 are simple and straightforward but limited in their functionality. ftpmirror.py in the python distribution is closer to the mark but it didn't have the command-line flexibility I was after. What I needed was something that could be parameterized with various conditions like the Unix 'find' command so that I could, for instance, download the newest version of the foo*.x86_64.tar.gz provided it was less than two megabytes and less than a week old from ftp.bar.edu, like so:

ftpfind.py --expr='pattern( "foo*x86_64.tar.gz", file.name ) and file.age < week and file.size < 2*megabyte' ftp.bar.edu

3 comments

Guy Argo (author) 18 years, 9 months ago  # | flag

previous doc prematurely truncated.

ftpfind.py --expr='pattern( "foo*x86_64.tar.gz", file.name ) and file.age < week and file.size < 2*megabyte' ftp.bar.edu



"expr" can be any arbitrary python expression that's assumed to be the body
of a lambda of the form: lambda file: "expr". The file parameter is of type
FileInfo and has the following fields pre-computed to make life easy:

path  current path (string)
name  current filename (string)
longname file's path and name intelligently concatenated (string)
modeStr the permissions part of the ls listing, e.g. '-rwxrwxrwx' (string)
isdir   is the current file a directory (boolean)
islink  is the current file a link (boolean)
mode    the mode of the file as an octal number (int)
n.b. octal literals in python start with a 0, e.g. 0777
links   number of links to a file
owner   file's owner (string)
group   file's group (string)
size    file's size  (int)
dateStr file's modification date, e.g. 'Jul  6  2:04'
date    file's modification date expressed in secs since ... (int)
age     file's age in secs (int)
line    the original ls line obtained from ftp (string)
Guy Argo (author) 18 years, 9 months ago  # | flag

doc part 3.

All times are in seconds, all sizes are in bytes.
To make the age queries easier to write the following constants are
available: minute, hour, day, week, year. To make size queries easier,
the following constants are available: kilobyte(2**10), megabyte(2**20),
gigabyte(2**30), terabyte(2**40). To make unix style globbing simpler,
the function, pattern, is defined:

def pattern( pattern, candidate ): return fnmatch.fnmatch( candidate, pattern )

Of course you can define your own predicates and import them in the body
of the lambda.

Other useful features: file downloads are automatically resumed if previously
interrupted, and existing files are intelligently renamed (from foo to
.foo.NNN, e.g. .foo.001) to avoid clobbering.

Full command-line options:

-e --expr EXPR        filters files using 'lambda file: EXPR'. default: --expr=True
-p --password PASSWD  use PASSWD to authenticate on ftp server. default: ftpfind@sf.net
--print EXPR          print callback. default: --print='"%s" % file.line'
-s --daystart         compute file ages from midnight this morning
-t --test             print filenames but do not perform gets
-u --user=USER        use USER to authenticate on ftp server. default: anonymous
-x --exclude=DIR      add the DIR pattern to the list of excluded directories

Syntax:

ftpfind.py [option ...] server [dir ...]

I'll be putting this up on SourceForge when I get some time so others
can improve it.
Guy Argo (author) 18 years, 9 months ago  # | flag

typo in extract_info. Missing parameter, line, in the call to the FileInfo constructor- it should have read...

def extract_info( cwd, line ):
    # Corrected version of extract_info: identical to the recipe's except
    # that the raw listing line is passed as FileInfo's final argument.
    fullmode, links, owner, group, size, rest = line.split( None, 5 )
    isdir, islink, mode = str2perm( fullmode )
    # the first 12 characters of the remainder are the date; the name follows
    dateStr, name = rest[:12], rest[13:]
    date = parseDate( dateStr )
    return FileInfo( cwd, name, fullmode, isdir, islink, mode, int( links ), owner, group, int( size ), dateStr, date, line)