Welcome, guest | Sign In | My Account | Store | Cart

Data compression via Fibonacci encoding.

Python, 152 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# Fibonacci Data Compression
# http://en.wikipedia.org/wiki/Fibonacci_coding
# (Max compressible file size: 2**32 bytes)
# FB - 201101274
import sys
import os

def encode_fib(n):
    # Return string with Fibonacci encoding for n (n >= 1).
    result = ""
    if n >= 1:
        a = 1
        b = 1
        c = a + b   # next Fibonacci number
        fibs = [b]  # list of Fibonacci numbers, starting with F(2), each <= n
        while n >= c:
            fibs.append(c)  # add next Fibonacci number to end of list
            a = b
            b = c
            c = a + b
        result = "1"  # extra "1" at end
        for fibnum in reversed(fibs):
            if n >= fibnum:
                n = n - fibnum
                result = "1" + result
            else:
                result = "0" + result
    return result

def byteWriter(bitStr, outputFile):
    global bitStream
    bitStream += bitStr
    while len(bitStream) > 8: # write byte(s) if there are more then 8 bits
        byteStr = bitStream[:8]
        bitStream = bitStream[8:]
        outputFile.write(chr(int(byteStr, 2)))

def bitReader(n): # number of bits to read
    global byteArr
    global bitPosition
    bitStr = ''
    for i in range(n):
        bitPosInByte = 7 - (bitPosition % 8)
        bytePosition = int(bitPosition / 8)
        byteVal = byteArr[bytePosition]
        bitVal = int(byteVal / (2 ** bitPosInByte)) % 2
        bitStr += str(bitVal)
        bitPosition += 1 # prepare to read the next bit
    return bitStr

# MAIN
if len(sys.argv) != 4:
    print 'Usage: Fibonacci.py [e|d] [path]InputFileName [path]OutputFileName'
    sys.exit()
mode = sys.argv[1] # encoding/decoding
inputFile = sys.argv[2]
outputFile = sys.argv[3]

# read the whole input file into a byte array
fileSize = os.path.getsize(inputFile)
fi = open(inputFile, 'rb')
# byteArr = map(ord, fi.read(fileSize))
byteArr = bytearray(fi.read(fileSize))
fi.close()
fileSize = len(byteArr)
print 'File size in bytes:', fileSize
print

if mode == 'e': # FILE ENCODING
    # calculate the total number of each byte value in the file
    freqList = [0] * 256
    for b in byteArr:
        freqList[b] += 1

    # create a list of (frequency, byteValue, encodingBitStr) tuples
    tupleList = []
    for b in range(256):
        if freqList[b] > 0:
            tupleList.append((freqList[b], b, ''))

    # sort the list according to the frequencies descending
    tupleList = sorted(tupleList, key=lambda tup: tup[0], reverse = True)

    # assign encoding bit strings to each byte value
    for b in range(len(tupleList)):
        tupleList[b] = (tupleList[b][0], tupleList[b][1], encode_fib(b + 1))
    # print 'The list of (frequency, byteValue, encodingBitStr) tuples:'
    # print tupleList
    # print

    # write the list of byte values as the compressed file header
    bitStream = '' # global
    fo = open(outputFile, 'wb')
    fo.write(chr(len(tupleList) - 1)) # first write the number of byte values
    for (freq, byteValue, encodingBitStr) in tupleList:
        # convert the byteValue into 8-bit and send to be written into file
        # bitStr = bin(byteValue)
        # bitStr = bitStr[2:] # remove 0b
        # bitStr = '0' * (8 - len(bitStr)) + bitStr # add 0's if needed for 8 bits
        # byteWriter(bitStr, fo)
        fo.write(chr(byteValue)) # this would do the same

    # write 32-bit (input file size)-1 value
    bitStr = bin(fileSize - 1)
    bitStr = bitStr[2:] # remove 0b
    bitStr = '0' * (32 - len(bitStr)) + bitStr # add 0's if needed for 32 bits
    byteWriter(bitStr, fo)

    # create a dictionary of byteValue : encodingBitStr pairs
    dic = dict([(tup[1], tup[2]) for tup in tupleList])
    # del tupleList
    # print 'The dictionary of byteValue : encodingBitStr pairs:'
    # print dic

    # write the encoded data
    for b in byteArr:
        byteWriter(dic[b], fo)

    byteWriter('0' * 8, fo) # to write the last remaining bits (if any)
    fo.close()

elif mode == 'd': # FILE DECODING
    bitPosition = 0 # global
    n = int(bitReader(8), 2) + 1 # first read the number of byte values
    # print 'Number of byte values:', n
    dic = dict()
    for i in range(n):
        # read the byteValue
        byteValue = int(bitReader(8), 2)
        encodingBitStr = encode_fib(i + 1)
        dic[encodingBitStr] = byteValue # add to the dictionary
    # print 'The dictionary of encodingBitStr : byteValue pairs:'
    # print dic
    # print

    # read 32-bit file size (number of encoded bytes) value
    numBytes = long(bitReader(32), 2) + 1
    print 'Number of bytes to decode:', numBytes
    
    # read the encoded data, decode it, write into the output file
    fo = open(outputFile, 'wb')
    for b in range(numBytes):
        # read bits until a decoding match is found
        encodingBitStr = ''
        while True:
            encodingBitStr += bitReader(1)
            # if encodingBitStr in dic:
            if encodingBitStr.endswith('11'):
                byteValue = dic[encodingBitStr]
                fo.write(chr(byteValue))
                break
    fo.close()

2 comments

FB36 (author) 13 years, 2 months ago  # | flag

The code (encoding part) works by encoding the most common byte value in the input data file using the shortest Fibonacci encoding (bit string) (which is the encoding of n=1); likewise the second most common byte value encoded using the Fibonacci encoding of n=2, and so on.

FB36 (author) 13 years, 2 months ago  # | flag

The encoding part also writes the list of all byte values (max 256) (which encountered in the input file), sorted according to most common to least common order, to the compressed file as the header, so the decoding part later can read those byte values to figure out which Fibonacci code must be decoded as which byte value.

Created by FB36 on Fri, 28 Jan 2011 (MIT)
Python recipes (4591)
FB36's recipes (148)

Required Modules

  • (none specified)

Other Information and Tasks