Welcome, guest | Sign In | My Account | Store | Cart
#!/usr/bin/env python
#
# find-duplicates
# by Keith Gaughan <http://talideon.com/>
#
# Finds an lists any duplicate files in the given directories.
#
# Copyright (c) Keith Gaughan, 2008.
# All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#  1. Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#
#  2. Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# This license is subject to the laws and courts of the Republic of Ireland.
#

from __future__ import with_statement
import sys
import os
import hashlib
import getopt
import filecmp


USAGE = "Usage: %s [-h] [-m<crc|md5>] <dir>*"


class crc:
    """
    Wraps up zlib.crc32 to make it suitable for use as a faster but less
    accurate alternative to the hashlib.* classes.
    """
    def __init__(self, initial=None):
        self.crc = 0
        if initial is not None:
            self.update(initial)
    def update(self, block):
        import zlib
        self.crc = zlib.crc32(block, self.crc)
    def hexdigest(self):
        return "%X" % self.crc
    def digest(self):
        # Er...
        return self.crc


def all_files(*tops):
    """Lists all files in the given directories."""
    for top in tops:
        for dirname, _, filenames in os.walk(top):
            for f in filenames:
                path = os.path.join(dirname, f)
                if os.path.isfile(path):
                    yield path


def digest(file, method=hashlib.md5):
    with open(file) as f:
        h = method(f.read()).digest()
    return h


def true_duplicates(files):
    """
    Compare the given files, breaking them down into groups with identical
    content.
    """
    while len(files) > 1:
        next_set = []
        this_set = []
        master = files[0]
        this_set.append(master)
        for other in files[1:]:
            if filecmp.cmp(master, other, False):
                this_set.append(other)
            else:
                next_set.append(other)
        if len(this_set) > 1:
            yield this_set
        files = next_set


def group_by(groups, grouper, min_size=1):
    """Breaks each of the groups into smaller subgroups."""
    for group in groups:
        subgroups = {}
        for item in group:
            g = grouper(item)
            if not subgroups.has_key(g):
                subgroups[g] = []
            subgroups[g].append(item)
        for g in subgroups.itervalues():
            if len(g) >= min_size:
                yield g


def usage(message=None):
    global USAGE
    fh = sys.stdout
    exit_code = 0
    if message:
        fh = sys.stderr
        exit_code = 2
        print >>fh, str(message)
    name = os.path.basename(sys.argv[0])
    print >>fh, USAGE % (name,)
    sys.exit(exit_code)


def main():
    try:
        opts, paths = getopt.getopt(sys.argv[1:], "hm:")
    except getopt.GetoptError, err:
        usage(err)
    method = crc
    for o, a in opts:
        if o == "-m":
            if a == "crc":
                method = crc
            elif a == "md5":
                method = hashlib.md5
            else:
                usage("Unknown grouping method: %s" % (a,))
        elif o == "-h":
            usage()
        else:
            usage("Unknown option: %s%s" % (o, a))

    if len(paths) == 0:
        paths = ["."]

    first = True
    groups = [all_files(*paths)]
    for grouper in [os.path.getsize, lambda file: digest(file, method)]:
        groups = group_by(groups, grouper, 2)
    for group in groups:
        for files in sorted(true_duplicates(group)):
            if first:
                first = False
            else:
                print
            for file in files:
                print file


if __name__ == "__main__":
    main()

History