Welcome, guest | Sign In | My Account | Store | Cart

Finds duplicate files (files that have the same size and the same content) within a single directory or across two different directories.

Python, 64 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Find Duplicate Files
# FB36 - 20141012
import sys
import os
import glob
import hashlib

# Number of bytes read per iteration while hashing, so that large files are
# never loaded into memory in one piece.
_HASH_CHUNK_SIZE = 1024 * 1024


def _print_usage():
    """Print the command-line help text to standard output."""
    print("USAGE:")
    print("[python] FindDuplicateFiles.py FilePath [FilePath2]")
    print("FilePath2 is optional.")
    print("If file path(s) have spaces then add quotes around.")
    print("File path(s) must include wildcards in the end")
    # The backslash is escaped explicitly; the emitted text is unchanged.
    print("like ...\\*.*")


def _matching_files(pattern):
    """Return the regular files matched by the glob *pattern*, sorted.

    Directories (and anything else that is not a regular file) are dropped:
    a pattern such as ``*.*`` can match a directory name, and a directory
    cannot be opened for hashing.
    """
    return sorted(p for p in glob.glob(pattern) if os.path.isfile(p))


def _sha256_of(path):
    """Return the SHA-256 hex digest of the contents of the file at *path*."""
    digest = hashlib.sha256()
    # ``with`` closes the handle deterministically; chunked reads keep the
    # memory footprint bounded regardless of file size.
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(_HASH_CHUNK_SIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()


def _find_duplicate_groups(pattern1, pattern2=None):
    """Return groups of files that have the same size and the same content.

    pattern1 -- glob pattern selecting the first set of files.
    pattern2 -- optional glob pattern selecting a second set of files.

    Returns a sorted list of lists of paths. Every inner list holds at least
    two paths whose sizes and SHA-256 digests are equal. A path matched by
    both patterns is considered only once, so it is never reported as a
    duplicate of itself.
    """
    paths = _matching_files(pattern1)
    if pattern2 is not None and pattern2 != pattern1:
        paths += _matching_files(pattern2)

    # First pass: bucket by size, which is cheap and avoids hashing files
    # that cannot possibly have a duplicate.
    by_size = {}
    seen = set()
    for path in paths:
        if path in seen:
            continue
        seen.add(path)
        by_size.setdefault(os.path.getsize(path), []).append(path)

    # Second pass: within each size bucket, bucket again by content digest.
    groups = []
    for same_size in by_size.values():
        if len(same_size) < 2:
            continue
        by_digest = {}
        for path in same_size:
            by_digest.setdefault(_sha256_of(path), []).append(path)
        groups.extend(g for g in by_digest.values() if len(g) > 1)

    # Sorting makes the report independent of dict/directory iteration order.
    groups.sort()
    return groups


def main(argv):
    """Run the duplicate finder for the command line *argv*.

    argv -- full argument list, program name first (as in ``sys.argv``).

    Prints each group of duplicate files, one path per line, followed by an
    empty line. Returns the process exit status: 1 after printing the usage
    text when the argument count is wrong, 0 otherwise.
    """
    if len(argv) < 2 or len(argv) > 3:
        _print_usage()
        return 1

    pattern1 = argv[1]
    pattern2 = argv[2] if len(argv) > 2 else None

    for group in _find_duplicate_groups(pattern1, pattern2):
        for path in group:
            print(path)
        print("")
    return 0


if __name__ == "__main__":
    # sys.exit (unlike os._exit) flushes stdio buffers, so the usage text is
    # not lost when the output is piped or redirected.
    sys.exit(main(sys.argv))

1 comment

Alexander Semenov 9 years, 6 months ago  # | flag

Oh, no. Consider using setdefault() instead of

if fileSizeList1[i] not in fileSizeGroups:
    fileSizeGroups[fileSizeList1[i]] = [fileList1[i]]
elif fileList1[i] not in fileSizeGroups[fileSizeList1[i]]:
    fileSizeGroups[fileSizeList1[i]].append(fileList1[i])
Created by FB36 on Sun, 12 Oct 2014 (MIT)
Python recipes (4591)
FB36's recipes (148)

Required Modules

  • (none specified)

Other Information and Tasks