Diff JAR / WAR / ZIP files (even recursively bundled).
Whole files can be ignored, and lines matching given patterns can be ignored in text files (such as MANIFEST.MF).
This script is useful for checking whether a new build has changed anything in your binary JAR/WAR/ZIP.
#!/usr/bin/python
#
# Author : Raphael Jolivet
# Release : 22-mar-2011
import re
import sys
from fnmatch import fnmatch
from io import BytesIO
from StringIO import StringIO
from zipfile import ZipFile
# ----------------------------------------------------------------------------
# Class IgnoreRules
# ----------------------------------------------------------------------------
class IgnoreRules(object) :
    """
    Ignore Rules class.

    Made of a list of 'glob' filename patterns used to ignore whole files,
    and a dictionary of <'glob' filename pattern> => [list of regexp patterns]
    used to ignore some lines in specific text files.

    Attributes :
      ignoreFiles           -- list of 'glob' filename patterns (str)
      ignorePatternsPerFile -- dict mapping a 'glob' filename pattern to the
                               list of its compiled regexp patterns
    """

    def __init__(
        self,
        ignoreFiles = None,           # List of file patterns to ignore (ala 'glob' with ? and * wildcards)
        ignorePatternsPerFile = None  # Map of <file glob pattern> => [list of regexp patterns] for lines to ignore in some text files
    ) :
        """
        Build the rules; both arguments are optional and default to "ignore nothing".

        The regexp patterns are given as strings and are compiled here.
        Raises re.error if one of them is not a valid regular expression.
        """
        # None stands for "empty" : a mutable default value ([] or {}) would be
        # a single object shared by every instance built without arguments.
        if ignoreFiles is None :
            ignoreFiles = []
        if ignorePatternsPerFile is None :
            ignorePatternsPerFile = {}

        # Keep a private copy so later changes to the caller's list do not alter the rules
        self.ignoreFiles = list(ignoreFiles)

        # Compile the line patterns once, here, rather than for every compared line
        self.ignorePatternsPerFile = dict(
            (fileGlob, [re.compile(pattern) for pattern in linePatterns])
            for fileGlob, linePatterns in ignorePatternsPerFile.items())
# ----------------------------------------------------------------------------
# Config
# ----------------------------------------------------------------------------
# Default rules used by the command line entry point.
# The regexp patterns are raw strings so that '\s' reaches the 're' module
# untouched instead of being an invalid string escape sequence.
RULES = IgnoreRules(
    # Ignored files
    [
        "*/README.txt",
        "*.java"  # Source files
    ],
    # Lines ignored in text files
    {
        # Version information within manifests
        "META-INF/MANIFEST.MF" : [
            r'^Implementation-Version\s*:.*$',
            r'^Implementation-Build-Time\s*:.*$',
            r'^Implementation-Revision\s*:.*$'],
        # Comments within INI files : "; Blabla"
        "*.ini" : [
            r'^\s*;.*$']
    }
)
# ----------------------------------------------------------------------------
# Main method
# ----------------------------------------------------------------------------
def diffZips(zip1, zip2, ignoreRules) :
    """
    Diff between two zipfiles.

    Arguments :
      zip1, zip2  -- the two open ZipFile objects to compare
      ignoreRules -- object exposing 'ignoreFiles' (list of 'glob' filename
                     patterns) and 'ignorePatternsPerFile' (dict of 'glob'
                     filename pattern => list of compiled regexp patterns)

    Returns None if the archives are the same.
    Returns a string describing the first diff encountered otherwise;
    entries are visited in sorted name order, so that string is stable
    from one run to the next.
    """
    # Build maps of entries
    zip1Map = dict((entry.filename, entry) for entry in zip1.infolist())
    zip2Map = dict((entry.filename, entry) for entry in zip2.infolist())

    # Check we have same list of files
    zip1KeySet = set(zip1Map)
    zip2KeySet = set(zip2Map)
    if zip1KeySet != zip2KeySet :
        # Names present in only one of the two archives
        unmatched = sorted(zip1KeySet.symmetric_difference(zip2KeySet))
        return "Different list of entries: " + ', '.join(unmatched)

    # Loop on entries
    for filename in sorted(zip1KeySet) :

        # Is it a folder => Then no diffs, its contents will be checked anyway
        if filename.endswith('/') :
            continue

        # Get each entry
        entry1 = zip1Map[filename]
        entry2 = zip2Map[filename]

        # Is it a bundled zip ?
        if any(fnmatch(filename, glob) for glob in ("*.zip", "*.war", "*.jar")) :

            # Same CRC and size ? They are identical : No need to look into it
            if entry1.file_size == entry2.file_size and entry1.CRC == entry2.CRC :
                continue

            # Recursively diff them
            diff = _diffBundledArchives(zip1.read(entry1), zip2.read(entry2), ignoreRules)

            # Diff found => exit
            if diff is not None :
                return "In %s : %s" % (filename, diff)

            # No diff here : skip to next one
            continue

        # Do we ignore this file ?
        if any(fnmatch(filename, pattern) for pattern in ignoreRules.ignoreFiles) :
            continue

        # Is it a text file ? Only the first matching filename pattern is used.
        textFile = False
        for pattern in ignoreRules.ignorePatternsPerFile :
            if fnmatch(filename, pattern) :
                textFile = True
                result = _diffTextEntries(
                    zip1, entry1,
                    zip2, entry2,
                    ignoreRules.ignorePatternsPerFile[pattern])
                if result is not None :
                    return "Text files %s are not the same : %s" % (filename, result)
                break

        # This was a text file ? => already checked => continue
        if textFile :
            continue

        # -- Binary file ?

        # Check size
        if entry1.file_size != entry2.file_size :
            return "Entry '%s' has different sizes : %d <> %d" % (filename, entry1.file_size, entry2.file_size)

        # Check CRC
        if entry1.CRC != entry2.CRC :
            return "Entry '%s' has different CRCs : %s <> %s" % (filename, entry1.CRC, entry2.CRC)

    # End of loop on entries
    # No diff found here
    return None


def _diffBundledArchives(data1, data2, ignoreRules) :
    """Diff two archives given as raw bytes; both are closed even if the diff raises."""
    subZip1 = ZipFile(BytesIO(data1))
    try :
        subZip2 = ZipFile(BytesIO(data2))
        try :
            return diffZips(subZip1, subZip2, ignoreRules)
        finally :
            subZip2.close()
    finally :
        subZip1.close()


def _diffTextEntries(zip1, entry1, zip2, entry2, ignorePatterns) :
    """Diff two text entries line by line; both streams are closed even if the diff raises."""
    file1 = zip1.open(entry1)
    try :
        file2 = zip2.open(entry2)
        try :
            return diffTextFiles(file1, file2, ignorePatterns)
        finally :
            file2.close()
    finally :
        file1.close()
def diffTextFiles(file1, file2, ignorePatterns) :
    """
    Diff two text files line by line, ignoring some lines.

    Arguments :
      file1, file2   -- file-like objects providing readline()
      ignorePatterns -- list of compiled regexp patterns; a line matched by
                        one of them compares equal to any other ignored line

    Leading and trailing whitespace of each line is not significant.
    Returns None if files are identical, a string describing the first
    diff otherwise.
    """
    lineNo = 0
    while True :
        # Get next lines
        raw1 = file1.readline()
        raw2 = file2.readline()
        lineNo += 1

        # We reached the end of both files.
        # This is tested before stripping : readline() returns an empty value
        # only at end of file, whereas a blank line still holds its line
        # terminator and must not stop the comparison.
        if not raw1 and not raw2 :
            return None

        line1 = _normalizeLine(raw1, ignorePatterns)
        line2 = _normalizeLine(raw2, ignorePatterns)
        if line1 != line2 :
            return "Line %d differ : '%s' <> '%s'" % (lineNo, line1, line2)


def _normalizeLine(rawLine, ignorePatterns) :
    """Strip a line, turn it into text, and replace it by '#IGNORED' if a pattern matches."""
    line = rawLine.strip()
    # On Python 3 a binary stream yields bytes, which the text patterns cannot
    # match : decode them, keeping undecodable bytes visible as \xNN escapes.
    # On Python 2 'bytes' is 'str', so this branch is never taken.
    if isinstance(line, bytes) and not isinstance(line, str) :
        line = line.decode("utf-8", "backslashreplace")
    for pattern in ignorePatterns :
        if pattern.match(line) is not None :
            return "#IGNORED"
    return line
# --------------------------------------------------------------------
# Main
# --------------------------------------------------------------------
if __name__ == "__main__":
    # Two archive paths are required on the command line
    if len(sys.argv) < 3 :
        sys.stderr.write("Usage: jardiff <file1.jar> <file2.jar>\n")
        sys.exit(2)

    # Get arguments, create zipfiles; both are closed even if the diff raises
    zip1 = ZipFile(sys.argv[1], 'r')
    try :
        zip2 = ZipFile(sys.argv[2], 'r')
        try :
            # Diff zipfiles
            result = diffZips(
                zip1,
                zip2,
                RULES)
        finally :
            zip2.close()
    finally :
        zip1.close()

    # If diff    : print diff description, status=1
    # If no diff : print nothing, status=0
    if result is None :
        sys.exit(0)
    else:
        # With a single argument this form prints the same text on Python 2 and Python 3
        print(result)
        sys.exit(1)
|
Algorithm
The two archive files (JAR, WAR or ZIP) are opened and their tables of contents (TOC) are compared. If they have the same list of entries, each entry is compared this way:
- Text files are compared considering the "ignore lines patterns"
- Folder entries are ignored (their contents are compared anyway)
- Binary files : Size and CRC hash are compared.
- Bundled ZIP, WAR or JAR archives : if their CRC or size differ, they are opened and compared recursively.
CLI Usage
Here is the command line usage :
jardiff <file1.jar> <file2.jar>
The configuration of ignored files and ignored text lines can be changed in the beginning of the script, in the 'Config' section.
API usage
When called from a python script, you should use the main function diffZips
like this :
# Example call : zipFile1 and zipFile2 are two open ZipFile objects.
# The regexp patterns are raw strings so that '\s' is not read as a string escape.
diffZips(
    zipFile1,
    zipFile2,
    IgnoreRules(
        # Ignored files
        [
            "*/README.txt",
            "*.java"  # Source files
        ],
        # Lines ignored in text files
        {
            # Version information within manifests
            "META-INF/MANIFEST.MF" : [
                r'^Implementation-Version\s*:.*$',
                r'^Implementation-Build-Time\s*:.*$',
                r'^Implementation-Revision\s*:.*$'],
            # Comments within INI files : "; Blabla"
            "*.ini" : [
                r'^\s*;.*$']})
)
Known bugs
The comparison of binary files is based on CRC and file size. Theoretically, two files could have the same file size and CRC and still be different, but this is very unlikely, and even more unlikely considering that they should both be valid resource or class files.
I was getting an error on OS X (10.7.4) Python 2.7.1. I am not a Python dev by any stretch of the imagination so take this with a grain of salt. I ran it from the command-line in the form: 'jardiff.py jar1 jar2'
Traceback (most recent call last): File "jardiff.py", line 212, in <module> RULES) File "jardiff.py", line 88, in diffZips return "Different list of entries" + zip1KeySet.symmetric_difference(zip1KeySet) TypeError: cannot concatenate 'str' and 'set' objects
I fixed it by updating line 88 to be: return "Different list of entries: " + ', '.join(zip1KeySet.symmetric_difference(zip2KeySet))
There were 2 issues with the line before: 1) The attempt to concatenate a string and a set 2) The symmetric_difference was comparing the set to itself
The above updated line fixes those 2 issues.