Welcome, guest | Sign In | My Account | Store | Cart

Shannon-Fano Data Compression. It can compress any kind of file up to 4 GB.

(But trying to compress an already compressed file like zip, jpg etc. can produce a (slightly) larger file though. Shannon-Fano is not the best data compression algorithm anyway.)

Python, 154 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Shannon-Fano Data Compression
# http://en.wikipedia.org/wiki/Shannon%E2%80%93Fano_coding
# (Max compressible file size: 2**32 bytes)
# FB - 201012153
import sys
import os

def shannon_fano_encoder(iA, iB): # iA to iB : index interval
    global tupleList
    size = iB - iA + 1
    if size > 1:
        # Divide the list into 2 groups.
        # Top group will get 0, bottom 1 as the new encoding bit.
        mid = int(size / 2 + iA)
        for i in range(iA, iB + 1):
            tup = tupleList[i]
            if i < mid: # top group
                tupleList[i] = (tup[0], tup[1], tup[2] + '0')
            else: # bottom group
                tupleList[i] = (tup[0], tup[1], tup[2] + '1')
        # do recursive calls for both groups
        shannon_fano_encoder(iA, mid - 1)
        shannon_fano_encoder(mid, iB)

def byteWriter(bitStr, outputFile):
    global bitStream
    bitStream += bitStr
    while len(bitStream) > 8: # write byte(s) if there are more then 8 bits
        byteStr = bitStream[:8]
        bitStream = bitStream[8:]
        outputFile.write(chr(int(byteStr, 2)))

def bitReader(n): # number of bits to read
    global byteArr
    global bitPosition
    bitStr = ''
    for i in range(n):
        bitPosInByte = 7 - (bitPosition % 8)
        bytePosition = int(bitPosition / 8)
        byteVal = byteArr[bytePosition]
        bitVal = int(byteVal / (2 ** bitPosInByte)) % 2
        bitStr += str(bitVal)
        bitPosition += 1 # prepare to read the next bit
    return bitStr

# MAIN
if len(sys.argv) != 4:
    print 'Usage: ShannonFano.py [e|d] [path]InputFileName [path]OutputFileName'
    sys.exit()
mode = sys.argv[1] # encoding/decoding
inputFile = sys.argv[2]
outputFile = sys.argv[3]

# read the whole input file into a byte array
fileSize = os.path.getsize(inputFile)
fi = open(inputFile, 'rb')
# byteArr = map(ord, fi.read(fileSize))
byteArr = bytearray(fi.read(fileSize))
fi.close()
fileSize = len(byteArr)
print 'File size in bytes:', fileSize
print

if mode == 'e': # FILE ENCODING
    # calculate the total number of each byte value in the file
    freqList = [0] * 256
    for b in byteArr:
        freqList[b] += 1

    # create a list of (frequency, byteValue, encodingBitStr) tuples
    tupleList = []
    for b in range(256):
        if freqList[b] > 0:
            tupleList.append((freqList[b], b, ''))

    # sort the list according to the frequencies descending
    tupleList = sorted(tupleList, key=lambda tup: tup[0], reverse = True)

    shannon_fano_encoder(0, len(tupleList) - 1)
    # print 'The list of (frequency, byteValue, encodingBitStr) tuples:'
    # print tupleList
    # print

    # create a dictionary of byteValue : encodingBitStr pairs
    dic = dict([(tup[1], tup[2]) for tup in tupleList])
    del tupleList # unneeded anymore
    # print 'The dictionary of byteValue : encodingBitStr pairs:'
    # print dic

    # write a list of (byteValue,3-bit(len(encodingBitStr)-1),encodingBitStr)
    # tuples as the compressed file header
    bitStream = ''
    fo = open(outputFile, 'wb')
    fo.write(chr(len(dic) - 1)) # first write the number of encoding tuples
    for (byteValue, encodingBitStr) in dic.iteritems():
        # convert the byteValue into 8-bit and send to be written into file
        bitStr = bin(byteValue)
        bitStr = bitStr[2:] # remove 0b
        bitStr = '0' * (8 - len(bitStr)) + bitStr # add 0's if needed for 8 bits
        byteWriter(bitStr, fo)
        # convert len(encodingBitStr) to 3-bit and send to be written into file
        bitStr = bin(len(encodingBitStr) - 1) # 0b0 to 0b111
        bitStr = bitStr[2:] # remove 0b
        bitStr = '0' * (3 - len(bitStr)) + bitStr # add 0's if needed for 3 bits
        byteWriter(bitStr, fo)
        # send encodingBitStr to be written into file
        byteWriter(encodingBitStr, fo)

    # write 32-bit (input file size)-1 value
    bitStr = bin(fileSize - 1)
    bitStr = bitStr[2:] # remove 0b
    bitStr = '0' * (32 - len(bitStr)) + bitStr # add 0's if needed for 32 bits
    byteWriter(bitStr, fo)

    # write the encoded data
    for b in byteArr:
        byteWriter(dic[b], fo)

    byteWriter('0' * 8, fo) # to write the last remaining bits (if any)
    fo.close()

elif mode == 'd': # FILE DECODING
    bitPosition = 0
    n = int(bitReader(8), 2) + 1 # first read the number of encoding tuples
    # print 'Number of encoding tuples:', n
    dic = dict()
    for i in range(n):
        # read the byteValue
        byteValue = int(bitReader(8), 2)
        # read 3-bit(len(encodingBitStr)-1) value
        m = int(bitReader(3), 2) + 1
        # read encodingBitStr
        encodingBitStr = bitReader(m)
        dic[encodingBitStr] = byteValue # add to the dictionary
    # print 'The dictionary of encodingBitStr : byteValue pairs:'
    # print dic
    # print

    # read 32-bit file size (number of encoded bytes) value
    numBytes = long(bitReader(32), 2) + 1
    print 'Number of bytes to decode:', numBytes
    
    # read the encoded data, decode it, write into the output file
    fo = open(outputFile, 'wb')
    for b in range(numBytes):
        # read bits until a decoding match is found
        encodingBitStr = ''
        while True:
            encodingBitStr += bitReader(1)
            if encodingBitStr in dic:
                byteValue = dic[encodingBitStr]
                fo.write(chr(byteValue))
                break
    fo.close()

2 comments

bartender 11 years, 2 months ago  # | flag

Great Solution, but i had one doubt: On line 14,instead of mid we should find an index which minimises the difference in sum of left half and right half as per the algorithm (Step 3) on Wiki:

3.Divide the list into two parts, with the total frequency counts of the left part being as close to the total of the right as possible.

Any thoughts?

FB36 (author) 11 years, 2 months ago  # | flag

It looks like you are right.

I think it can be implemented as looping thru all possible index vales; at each value calculate frequency totals for left and right side and find the difference; finally choose the index value that gives the min difference.

I am not sure this change would provide any significant difference in compressed file size though.

Created by FB36 on Thu, 16 Dec 2010 (MIT)
Python recipes (4591)
FB36's recipes (148)

Required Modules

  • (none specified)

Other Information and Tasks