Shannon-Fano Data Compression. It can compress any kind of file up to 4 GB.
(But trying to compress an already compressed file like zip, jpg etc. can produce a (slightly) larger file though. Shannon-Fano is not the best data compression algorithm anyway.)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | # Shannon-Fano Data Compression
# http://en.wikipedia.org/wiki/Shannon%E2%80%93Fano_coding
# (Max compressible file size: 2**32 bytes)
# FB - 201012153
import sys
import os
def shannon_fano_encoder(iA, iB): # iA to iB : index interval
global tupleList
size = iB - iA + 1
if size > 1:
# Divide the list into 2 groups.
# Top group will get 0, bottom 1 as the new encoding bit.
mid = int(size / 2 + iA)
for i in range(iA, iB + 1):
tup = tupleList[i]
if i < mid: # top group
tupleList[i] = (tup[0], tup[1], tup[2] + '0')
else: # bottom group
tupleList[i] = (tup[0], tup[1], tup[2] + '1')
# do recursive calls for both groups
shannon_fano_encoder(iA, mid - 1)
shannon_fano_encoder(mid, iB)
def byteWriter(bitStr, outputFile):
global bitStream
bitStream += bitStr
while len(bitStream) > 8: # write byte(s) if there are more then 8 bits
byteStr = bitStream[:8]
bitStream = bitStream[8:]
outputFile.write(chr(int(byteStr, 2)))
def bitReader(n): # number of bits to read
global byteArr
global bitPosition
bitStr = ''
for i in range(n):
bitPosInByte = 7 - (bitPosition % 8)
bytePosition = int(bitPosition / 8)
byteVal = byteArr[bytePosition]
bitVal = int(byteVal / (2 ** bitPosInByte)) % 2
bitStr += str(bitVal)
bitPosition += 1 # prepare to read the next bit
return bitStr
# MAIN
if len(sys.argv) != 4:
print 'Usage: ShannonFano.py [e|d] [path]InputFileName [path]OutputFileName'
sys.exit()
mode = sys.argv[1] # encoding/decoding
inputFile = sys.argv[2]
outputFile = sys.argv[3]
# read the whole input file into a byte array
fileSize = os.path.getsize(inputFile)
fi = open(inputFile, 'rb')
# byteArr = map(ord, fi.read(fileSize))
byteArr = bytearray(fi.read(fileSize))
fi.close()
fileSize = len(byteArr)
print 'File size in bytes:', fileSize
print
if mode == 'e': # FILE ENCODING
# calculate the total number of each byte value in the file
freqList = [0] * 256
for b in byteArr:
freqList[b] += 1
# create a list of (frequency, byteValue, encodingBitStr) tuples
tupleList = []
for b in range(256):
if freqList[b] > 0:
tupleList.append((freqList[b], b, ''))
# sort the list according to the frequencies descending
tupleList = sorted(tupleList, key=lambda tup: tup[0], reverse = True)
shannon_fano_encoder(0, len(tupleList) - 1)
# print 'The list of (frequency, byteValue, encodingBitStr) tuples:'
# print tupleList
# print
# create a dictionary of byteValue : encodingBitStr pairs
dic = dict([(tup[1], tup[2]) for tup in tupleList])
del tupleList # unneeded anymore
# print 'The dictionary of byteValue : encodingBitStr pairs:'
# print dic
# write a list of (byteValue,3-bit(len(encodingBitStr)-1),encodingBitStr)
# tuples as the compressed file header
bitStream = ''
fo = open(outputFile, 'wb')
fo.write(chr(len(dic) - 1)) # first write the number of encoding tuples
for (byteValue, encodingBitStr) in dic.iteritems():
# convert the byteValue into 8-bit and send to be written into file
bitStr = bin(byteValue)
bitStr = bitStr[2:] # remove 0b
bitStr = '0' * (8 - len(bitStr)) + bitStr # add 0's if needed for 8 bits
byteWriter(bitStr, fo)
# convert len(encodingBitStr) to 3-bit and send to be written into file
bitStr = bin(len(encodingBitStr) - 1) # 0b0 to 0b111
bitStr = bitStr[2:] # remove 0b
bitStr = '0' * (3 - len(bitStr)) + bitStr # add 0's if needed for 3 bits
byteWriter(bitStr, fo)
# send encodingBitStr to be written into file
byteWriter(encodingBitStr, fo)
# write 32-bit (input file size)-1 value
bitStr = bin(fileSize - 1)
bitStr = bitStr[2:] # remove 0b
bitStr = '0' * (32 - len(bitStr)) + bitStr # add 0's if needed for 32 bits
byteWriter(bitStr, fo)
# write the encoded data
for b in byteArr:
byteWriter(dic[b], fo)
byteWriter('0' * 8, fo) # to write the last remaining bits (if any)
fo.close()
elif mode == 'd': # FILE DECODING
bitPosition = 0
n = int(bitReader(8), 2) + 1 # first read the number of encoding tuples
# print 'Number of encoding tuples:', n
dic = dict()
for i in range(n):
# read the byteValue
byteValue = int(bitReader(8), 2)
# read 3-bit(len(encodingBitStr)-1) value
m = int(bitReader(3), 2) + 1
# read encodingBitStr
encodingBitStr = bitReader(m)
dic[encodingBitStr] = byteValue # add to the dictionary
# print 'The dictionary of encodingBitStr : byteValue pairs:'
# print dic
# print
# read 32-bit file size (number of encoded bytes) value
numBytes = long(bitReader(32), 2) + 1
print 'Number of bytes to decode:', numBytes
# read the encoded data, decode it, write into the output file
fo = open(outputFile, 'wb')
for b in range(numBytes):
# read bits until a decoding match is found
encodingBitStr = ''
while True:
encodingBitStr += bitReader(1)
if encodingBitStr in dic:
byteValue = dic[encodingBitStr]
fo.write(chr(byteValue))
break
fo.close()
|
Great Solution, but i had one doubt: On line 14,instead of mid we should find an index which minimises the difference in sum of left half and right half as per the algorithm (Step 3) on Wiki:
3.Divide the list into two parts, with the total frequency counts of the left part being as close to the total of the right as possible.
Any thoughts?
It looks like you are right.
I think it can be implemented as looping thru all possible index vales; at each value calculate frequency totals for left and right side and find the difference; finally choose the index value that gives the min difference.
I am not sure this change would provide any significant difference in compressed file size though.