#!/usr/bin/env python3
"""Find duplicate files matching one or two glob patterns.

Files are first grouped by size (cheap), then candidate groups are
confirmed byte-for-byte via SHA-256.  Each group of 2+ identical files
is printed, one path per line, with a blank line between groups.
"""
# Find Duplicate Files
# FB36 - 20141012
import glob
import hashlib
import os
import sys

# Read files in 64 KiB chunks so huge files are never loaded whole.
_CHUNK = 1 << 16


def _hash_file(path):
    """Return the SHA-256 hex digest of the file at *path*, streamed."""
    digest = hashlib.sha256()
    # Context manager closes the handle (the original leaked open files).
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(_CHUNK), b''):
            digest.update(chunk)
    return digest.hexdigest()


def find_duplicate_groups(paths):
    """Group *paths* into lists of byte-identical files.

    Args:
        paths: iterable of existing file paths; repeated entries
            (e.g. from overlapping glob patterns) are ignored.

    Returns:
        A list of lists; each inner list holds two or more paths whose
        file contents are identical.
    """
    # De-duplicate input while preserving order (replaces the original's
    # O(n) list-membership checks), then bucket by size: files of
    # different sizes cannot be identical, so only multi-member size
    # buckets need hashing.
    by_size = {}
    for path in dict.fromkeys(paths):
        by_size.setdefault(os.path.getsize(path), []).append(path)

    by_hash = {}
    for same_size in by_size.values():
        if len(same_size) > 1:
            for path in same_size:
                by_hash.setdefault(_hash_file(path), []).append(path)

    return [group for group in by_hash.values() if len(group) > 1]


def main(argv=None):
    """Command-line entry point; returns a process exit code.

    Args:
        argv: argument vector (defaults to ``sys.argv``).
    """
    argv = sys.argv if argv is None else argv
    num_args = len(argv)  # of command-line arguments
    if num_args < 2 or num_args > 3:
        print("USAGE:")
        print("[python] FindDuplicateFiles.py FilePath [FilePath2]")
        print("FilePath2 is optional.")
        print("If file path(s) have spaces then add quotes around.")
        print("File path(s) must include wildcards in the end")
        print("like ...\\*.*")
        # Return a code (caller uses sys.exit) instead of os._exit(1),
        # which skipped interpreter cleanup such as flushing stdout.
        return 1

    file_list = glob.glob(argv[1])
    if num_args > 2:
        file_list += glob.glob(argv[2])

    # Output groups of files which have same size and same hash.
    for group in find_duplicate_groups(file_list):
        for path in group:
            print(path)
        print()
    return 0


if __name__ == "__main__":
    sys.exit(main())