Welcome, guest | Sign In | My Account | Store | Cart

This script looks for files with identical file names. If requested, file sizes are also compared. You can search the current directory or a list of directories specified on the command line. The search can be restricted to files whose names contain a given string or match a regular expression. Paths to duplicates are printed in groups separated by empty lines.

Python, 111 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python

"""Find duplicate file names.

Command line options:
  h - show help on usage
  s - compare file sizes
  n <text> - restrict to names containing text
  r <regex> - restrict to names containing regex match (overrides -n)

The non-option parameters, if specified, are used as search path.
Otherwise, current directory is used.
"""

import getopt
import os
import os.path
import re
import sys

def addIfFile(allfiles, dirname, file):
    """Record dirname under the key file if dirname/file is a regular file.

    allfiles maps a file name to the list of directories in which a file
    of that name was found; it is updated in place.  Entries for which
    os.path.isfile() is false are ignored.
    """
    if not os.path.isfile(os.path.join(dirname, file)):
        return
    allfiles.setdefault(file, []).append(dirname)

def checkdup(allfiles, dirname, files):
    """Directory visitor without a name filter.

    Hands every entry of files (names inside dirname) to addIfFile(),
    which updates allfiles.
    """
    for entry in files:
        addIfFile(allfiles, dirname, entry)

class CheckdupName:
    """Directory visitor that only considers names containing a substring."""

    def __init__(self, name):
        # Substring an entry name must contain to be considered.
        self.__name = name

    def __call__(self, allfiles, dirname, files):
        """Pass each entry of files whose name contains the substring to
        addIfFile(), which updates allfiles."""
        wanted = self.__name
        for entry in files:
            if wanted not in entry:
                continue
            addIfFile(allfiles, dirname, entry)

class CheckdupRegex:
    """Directory visitor that only considers names matched by a regex."""

    def __init__(self, pattern):
        # Compiled once here; re.compile raises re.error for a bad pattern.
        self.__re = re.compile(pattern)

    def __call__(self, allfiles, dirname, files):
        """Pass each entry of files in which the pattern is found (search,
        not a full match) to addIfFile(), which updates allfiles."""
        search = self.__re.search
        for entry in files:
            if not search(entry):
                continue
            addIfFile(allfiles, dirname, entry)

class HelpException(Exception):
    """Raised to request that the usage/help text be shown."""

def printDupNames(duplist):
    for n, d in duplist:
        for dd in d:
            pj = os.path.normpath(os.path.join(dd, n))
            print pj
        print

def printDupNameSizes(duplist):
    for n, d in duplist:
        szgroups = {}
        for dd in d:
            pj = os.path.normpath(os.path.join(dd, n))
            sz = os.stat(pj).st_size
            if sz in szgroups:
                szgroups[sz].append(pj)
            else:
                szgroups[sz] = [pj]
        for sz, g in szgroups.iteritems():
            if len(g) > 1:
                for n in g:
                    print n
                print

def main(argv):
    """Parse the command line, scan the search paths and print duplicates.

    argv -- the command line arguments without the program name.

    Options: -h (help), -s (also compare sizes), -n <text> (only names
    containing text), -r <regex> (only names matching regex).  Remaining
    arguments are the directories to search; the current directory is
    used when none are given.

    Raises getopt.GetoptError for an invalid option, re.error for a
    malformed -r pattern and HelpException when -h is given.
    """
    optlist, args = getopt.getopt(argv, "hsn:r:")
    prndup = printDupNames
    by_name = None
    by_regex = None
    for o, a in optlist:
        if o == "-h":
            raise HelpException()
        elif o == "-s":
            prndup = printDupNameSizes
        elif o == "-n":
            by_name = CheckdupName(a)
        elif o == "-r":
            by_regex = CheckdupRegex(a)

    # The documented contract is that -r overrides -n, whatever the order
    # in which the two options appear on the command line.
    if by_regex is not None:
        visit = by_regex
    elif by_name is not None:
        visit = by_name
    else:
        visit = checkdup

    paths = args or ["."]
    allfiles = {}
    # Absolute paths of directories already scanned.  Without this, repeated
    # or overlapping search paths (". ." or ". ./sub") would visit the same
    # directory twice and report each file as a duplicate of itself.
    seen = set()
    for path in paths:
        # os.walk replaces os.path.walk, which no longer exists in Python 3.
        for dirpath, dirnames, filenames in os.walk(path):
            key = os.path.abspath(dirpath)
            if key in seen:
                # Emptying dirnames in place stops os.walk from descending
                # into a subtree that has already been scanned.
                del dirnames[:]
                continue
            seen.add(key)
            # Only regular files are ever recorded (addIfFile checks
            # os.path.isfile), so the directory names need not be passed.
            visit(allfiles, dirpath, filenames)

    # Keep only the names found in more than one directory.
    duplist = [item for item in allfiles.items() if len(item[1]) > 1]
    duplist.sort()
    prndup(duplist)

if __name__ == "__main__":
    try:
        main(sys.argv[1:])
    except getopt.GetoptError, e:
        print >> sys.stderr, e
        print >> sys.stderr, "Try '%s -h' for help." % sys.argv[0]
        raise SystemExit(2)
    except re.error, e:
        print >> sys.stderr, "Malformed regex pattern:"
        print >> sys.stderr, e
        raise SystemExit(2)
    except HelpException, e:
        print "Usage: %s [options] [path [path ...]]" % sys.argv[0]
        print
        print __doc__

See Bill Bumgarner's Dupinator http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/362459 for a similar script that automatically deletes duplicates.