A script to compare and diff two md5 files.
import re
import sys
import string
import os.path
import time
"""
Reads the content of the two md5 files in two lists.
Each list will contain (md5, path, filename) entries :
* md5 : the md5, untouched
* path : the path, normalized (only forward slashes), filtered with the selected regular expression
* filename : the filename, untouched
"""
# Regular expression used to drop entries by directory: an entry is skipped
# when the expression is found in its normalized path.  To ignore the
# Subversion administrative directories use, for example:
#     re_compiled = re.compile(r'^\.svn|/\.svn')
# The default has to exclude nothing.  The empty negative lookahead '(?!)'
# can never match, whereas the former '^$' matched the empty path and so
# silently dropped every file that is not inside a directory.
re_compiled = re.compile(r'(?!)')


def read_md5_lines(filename):
    """Return the lines of the text file *filename* without line terminators.

    '\\r\\n', '\\r' and '\\n' are all accepted as line terminators.  The text
    is split explicitly instead of chopping the last character of every line,
    so a final line without a terminator is returned complete.  A file that
    ends with a terminator yields a trailing empty string.  The file is
    closed before returning.
    """
    # Plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+,
    # and the explicit replace() calls below give the same result on Python 2.
    with open(filename, 'r') as handle:
        text = handle.read()
    return text.replace('\r\n', '\n').replace('\r', '\n').split('\n')


def parse_md5_lines(lines, exclude_re):
    """Turn md5 file lines into a list of (md5, path, filename) tuples.

    lines      -- iterable of strings without line terminators
    exclude_re -- compiled regular expression; entries whose path it is found
                  in are skipped (None disables the filter)

    Lines starting with '#' or ';' and blank lines are ignored.  Characters
    0-31 of a line are the md5, characters 32-33 the separator and the rest
    the file location.  Backslashes in the location are turned into forward
    slashes before it is split, and one leading '/' is removed from the path.
    """
    entries = []
    for line in lines:
        if line[:1] in ('#', ';') or not line.strip():
            continue  # comment or blank line
        # Normalize before splitting so that Windows-style locations are
        # split into path and filename on every platform.
        location = line[34:].replace('\\', '/')
        if not location:
            continue  # no file name after the md5: not a usable entry
        path = os.path.dirname(location)
        if path[:1] == '/':
            path = path[1:]  # drop the leading '/' of an absolute path
        if exclude_re is not None and exclude_re.search(path):
            continue  # directory excluded by the regular expression
        entries.append((line[:32], path, os.path.basename(location)))
    return entries


if __name__ == '__main__':
    # Both files go through the same helpers, so they are filtered alike.
    list_one = parse_md5_lines(read_md5_lines(sys.argv[1]), re_compiled)  # entries of the first md5 file
    list_two = parse_md5_lines(read_md5_lines(sys.argv[2]), re_compiled)  # entries of the second md5 file
"""
Diff the two lists, obtaining two lists 'list_xxx_diff' whose entries are [md5, flag, path, filename] :
* flag :
** '==' equal (md5 =, path =, filename =)
** '<>' different (md5 <>, path =, filename =)
** '>>' new dx (md5 n/a, path <>, filename <>)
** '<<' new sx (md5 n/a, path <>, filename <>)
** 'm>' moved dx (md5 =, path <>, filename =)
** '<m' moved sx (md5 =, path <>, filename =)
** 'r>' renamed dx (md5 =, path =, filename <>)
** '<r' renamed sx (md5 =, path =, filename <>)
* first md5 : the md5 of the first md5 file
* second md5 : the md5 of the second md5 file
* path : path
* filename : filename
"""
# Matching passes, applied in this order.  Every pass pairs a still unmatched
# entry of the first list with the earliest still unmatched entry of the
# second list that has equal values in the listed fields
# (0 = md5, 1 = path, 2 = filename).
DIFF_PASSES = (
    ('==', '==', (0, 1, 2)),  # equal: same md5, path and filename
    ('<>', '<>', (1, 2)),     # different: same path and filename
    ('<m', 'm>', (0, 2)),     # moved: same md5 and filename
    ('<r', 'r>', (0, 1)),     # renamed: same md5 and path
)


def diff_md5_lists(entries_one, entries_two):
    """Diff two lists of (md5, path, filename) entries.

    Returns the pair (diff_one, diff_two).  Each result is a list of
    [md5, flag, path, filename] lists:

    * paired entries are appended to both results at the same time, pass by
      pass, in the order of the first list, with the flags of DIFF_PASSES;
    * entries of the first list left without a partner follow, flagged '<<';
    * entries of the second list left without a partner follow, flagged '>>'.

    The input lists are not modified.
    """
    diff_one = []
    diff_two = []
    left = list(entries_one)
    right = list(entries_two)
    for flag_one, flag_two, fields in DIFF_PASSES:
        # Index the remaining second-list entries by the compared fields, so
        # a partner is found with one lookup instead of a scan of the list.
        # Positions are stored in descending order: pop() then hands out the
        # earliest candidate first, exactly like a front-to-back scan.
        candidates = {}
        for position in range(len(right) - 1, -1, -1):
            key = tuple([right[position][field] for field in fields])
            candidates.setdefault(key, []).append(position)
        consumed = set()
        unpaired = []
        for entry in left:
            key = tuple([entry[field] for field in fields])
            positions = candidates.get(key)
            if positions:
                position = positions.pop()
                partner = right[position]
                diff_one.append([entry[0], flag_one, entry[1], entry[2]])
                diff_two.append([partner[0], flag_two, partner[1], partner[2]])
                consumed.add(position)
            else:
                unpaired.append(entry)
        left = unpaired
        right = [item for index, item in enumerate(right) if index not in consumed]
    # Whatever is still unpaired exists on one side only.
    for entry in left:
        diff_one.append([entry[0], '<<', entry[1], entry[2]])
    for entry in right:
        diff_two.append([entry[0], '>>', entry[1], entry[2]])
    return diff_one, diff_two


if __name__ == '__main__':
    # the two lists containing the result of the diff'ing
    list_one_diff, list_two_diff = diff_md5_lists(list_one, list_two)
"""
Printing the diff'ing list and some stats
"""
# (label, flag) pairs of the stats printed for each diff'ed list, in print order
FIRST_LIST_STATS = (
    ('# == equal ', '=='),
    ('# <> different ', '<>'),
    ('# << new sx ', '<<'),
    ('# <r renamed sx ', '<r'),
    ('# <m moved sx ', '<m'),
)
SECOND_LIST_STATS = (
    ('# == equal ', '=='),
    ('# <> different ', '<>'),
    ('# >> new dx ', '>>'),
    ('# r> renamed dx ', 'r>'),
    ('# m> moved dx ', 'm>'),
)


def format_diff_entry(entry):
    """Return the output line for one [md5, flag, path, filename] entry.

    The line is md5 + flag + path + '/' + filename.  When the path is empty
    the '/' is left out, so a file outside any directory is not printed as
    if it were located in the root directory.
    """
    md5, flag, path, filename = entry
    if path:
        return md5 + flag + path + '/' + filename
    return md5 + flag + filename


def format_stats(diff_list, labelled_flags):
    """Return the stats lines of a diff'ed list.

    diff_list      -- list of [md5, flag, path, filename] entries
    labelled_flags -- sequence of (label, flag) pairs

    One line 'label count' is produced per pair, in the given order, followed
    by a '# -- total' line with the number of entries.
    """
    flags = [entry[1] for entry in diff_list]
    # The extra ' ' is the separator the print statement used to insert
    # between the label and the number.
    lines = [label + ' ' + str(flags.count(flag)) for label, flag in labelled_flags]
    lines.append('# -- total ' + ' ' + str(len(flags)))
    return lines


if __name__ == '__main__':
    # Every print() call gets a single string, which prints the same text on
    # Python 2 and Python 3.
    stamp_format = "%a, %d %b %Y %H:%M:%S"
    # printing the first diff'ed md5
    print("#\n# diff'ed md5 '" + sys.argv[1] + "' (" + time.strftime(stamp_format, time.localtime()) + ")\n#")
    for diff_entry in list_one_diff:
        print(format_diff_entry(diff_entry))
    # printing the second diff'ed md5
    print("\n\n#\n# diff'ed md5 '" + sys.argv[2] + "' (" + time.strftime(stamp_format, time.localtime()) + ")\n#")
    for diff_entry in list_two_diff:
        print(format_diff_entry(diff_entry))
    # printing stats of the first list
    print("\n\n\n# *** stats of '" + sys.argv[1] + "' ***\n#")
    for stats_line in format_stats(list_one_diff, FIRST_LIST_STATS):
        print(stats_line)
    # printing stats of the second list
    print("\n\n# *** stats of '" + sys.argv[2] + "' ***\n#")
    for stats_line in format_stats(list_two_diff, SECOND_LIST_STATS):
        print(stats_line)
|
Script feature :
It can load all kinds (I hope ;-) of md5 files.
The files listed in the two input md5 files are diffed and flagged as: * '==' : equal (md5 =, path =, filename =) * '<>' : different (md5 <>, path =, filename =) * '<<' or '>>' : new (md5 n/a, path <>, filename <>) * '<m' or 'm>' : moved (md5 =, path <>, filename =) * '<r' or 'r>' : renamed (md5 =, path =, filename <>)
The following is printed to stdout: * the two input md5 files with the diff flag added; the only difference from a standard md5 file is the diff flag, printed between the md5 and the path/filename (chars 33-34) * some stats about the diff
Some background :
I needed this script to compare md5 files created with different programs, to catch file renames and moves and... because programming in Python is fun !!!