# (web page navigation residue) Welcome, guest | Sign In | My Account | Store | Cart
# Find Duplicate Files
# FB36 - 20141012
import sys
import os
import glob
import hashlib

# Command-line tool: expand one or two wildcard patterns, then print every
# group of matched files whose contents are identical.  Candidates are first
# bucketed by size (cheap) and only same-size files are hashed (expensive).

# Number of bytes read per step while hashing, so large files are never
# loaded into memory in one piece.
_CHUNK_SIZE = 1024 * 1024


def _print_usage():
    """Print the command-line help text to standard output."""
    print("USAGE:")
    print("[python] FindDuplicateFiles.py FilePath [FilePath2]")
    print("FilePath2 is optional.")
    print("If file path(s) have spaces then add quotes around.")
    print("File path(s) must include wildcards in the end")
    print(r"like ...\*.*")


def _hash_file(path):
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is read in fixed-size chunks and is closed before returning.
    """
    digest = hashlib.sha256()
    with open(path, 'rb') as handle:
        while True:
            chunk = handle.read(_CHUNK_SIZE)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def _collect_files(patterns):
    """Expand the glob ``patterns`` into a list of distinct regular files.

    Directories and other non-file matches are skipped because they cannot
    be hashed.  A file reached through several spellings of the same path
    (for example ``dir/a.txt`` and ``dir/./a.txt``) is kept only once, under
    the first spelling seen, so it is never reported as its own duplicate.
    """
    seen = set()
    files = []
    for pattern in patterns:
        for path in glob.glob(pattern):
            if not os.path.isfile(path):
                continue
            identity = os.path.normcase(os.path.abspath(path))
            if identity in seen:
                continue
            seen.add(identity)
            files.append(path)
    return files


def _group_in_order(paths, key_func):
    """Group ``paths`` by ``key_func(path)``.

    Returns a list of lists; groups appear in the order their key was first
    seen and each group keeps the input order of its members.
    """
    order = []
    groups = {}
    for path in paths:
        key = key_func(path)
        if key not in groups:
            groups[key] = []
            order.append(key)
        groups[key].append(path)
    return [groups[key] for key in order]


def find_duplicate_groups(patterns):
    """Return groups of files with identical size and SHA-256 hash.

    ``patterns`` is an iterable of glob patterns.  The result is a list of
    lists of paths; every inner list has at least two entries.
    """
    duplicates = []
    for same_size in _group_in_order(_collect_files(patterns),
                                     os.path.getsize):
        # A file with a unique size cannot have a duplicate: skip hashing.
        if len(same_size) < 2:
            continue
        for same_hash in _group_in_order(same_size, _hash_file):
            if len(same_hash) > 1:
                duplicates.append(same_hash)
    return duplicates


def main(argv):
    """Run the tool with ``argv`` (program name followed by 1 or 2 patterns).

    Prints each duplicate group, one path per line, followed by an empty
    line.  Returns the process exit status: 0 on success, 1 when the
    argument count is wrong (after printing the usage text).
    """
    num_args = len(argv)
    if num_args < 2 or num_args > 3:
        _print_usage()
        return 1
    for group in find_duplicate_groups(argv[1:]):
        for path in group:
            print(path)
        # print("") emits an empty line on both Python 2 and Python 3.
        print("")
    return 0


if __name__ == "__main__":
    # sys.exit (unlike os._exit) flushes buffered output before exiting.
    sys.exit(main(sys.argv))

# (web page residue) History