Welcome, guest | Sign In | My Account | Store | Cart

Shannon entropy of a string indicates minimum average number of bits per symbol required for encoding (compressing) the string.

Python, 41 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Shannon Entropy of a string
# = minimum average number of bits per symbol
# required for encoding the string
#
# So the theoretical limit for data compression:
# Shannon Entropy of the string * string length
# FB - 201011291
import math

st = 'aabcddddefffg' # input string
# st = '00010101011110' # Shannon entropy for this would be 1 bit/symbol

# NOTE: single-argument print() calls below work identically on Python 2 and 3;
# the original used Python-2-only print statements and the 'sets' module,
# which was deprecated in 2.6 and removed in Python 3 (builtin set replaces it).
print('Input string:')
print(st)
print('')
stList = list(st)
alphabet = list(set(stList)) # list of distinct symbols in the string
print('Alphabet of symbols in the string:')
print(alphabet)
print('')
# relative frequency (probability) of each symbol in the string;
# list.count is a single C-level pass, replacing the original manual inner loop
freqList = []
for symbol in alphabet:
    ctr = stList.count(symbol)
    freqList.append(float(ctr) / len(stList))
print('Frequencies of alphabet symbols:')
print(freqList)
print('')
# Shannon entropy: H = -sum(p * log2(p)) over all symbol probabilities p
# (every p here is > 0 because the alphabet is built from the string itself)
ent = 0.0
for freq in freqList:
    ent = ent + freq * math.log(freq, 2)
ent = -ent
print('Shannon entropy:')
print(ent)
print('Minimum number of bits required to encode each symbol:')
print(int(math.ceil(ent)))

13 comments

FB36 (author) 11 years, 7 months ago  # | flag

Since Shannon entropy allows calculation of theoretical data compression limit, it can be used to calculate efficiency percentage of any data compression software (for a particular input data file):

compression efficiency percentage = ((original file size) - (compressed file size))/((original file size) - (min file size possible)) * 100

(min file size possible) = SE * (file size)

(Actually this efficiency calculation is only approximate because (min file size possible) excludes the decompression key size, which in reality is included in the compressed file.)

FB36 (author) 11 years, 7 months ago  # | flag

Actually it is:

(min file size possible) = SE * (file size) / 8

because SE value means 'min bits per byte-character' in the file. So the result is in bits, which must be divided by 8 to convert to bytes.

FB36 (author) 11 years, 7 months ago  # | flag

This code can be used to calculate Shannon Entropy of file:

# file_entropy.py
#
# Shannon Entropy of a file
# = minimum average number of bits per character
# required for encoding (compressing) the file
#
# So the theoretical limit (in bytes) for data compression:
# Shannon Entropy of the file * file size (in bytes) / 8
# (Assuming the file is a string of byte-size (UTF-8?) characters
# because if not then the Shannon Entropy value would be different.)
# FB - 201011291
import sys
import math

if len(sys.argv) != 2:
    print("Usage: file_entropy.py [path]filename")
    sys.exit()

# read the whole file into a byte array;
# bytearray iterates as ints on both Python 2 and 3
# (the original map(ord, f.read()) fails on Python 3, where read() yields ints)
with open(sys.argv[1], "rb") as inFile:
    byteArr = bytearray(inFile.read())
fileSize = len(byteArr)
print('File size in bytes:')
print(fileSize)
print('')

# guard: an empty file would otherwise cause a ZeroDivisionError below
if fileSize == 0:
    print('Empty file: entropy is undefined.')
    sys.exit()

# count occurrences of each byte value in ONE pass over the data
# (the original scanned the whole file once per each of the 256 byte values)
counts = [0] * 256
for byte in byteArr:
    counts[byte] += 1
freqList = [float(c) / fileSize for c in counts]

# Shannon entropy: H = -sum(p * log2(p)) over the non-zero byte probabilities
ent = 0.0
for freq in freqList:
    if freq > 0:
        ent = ent + freq * math.log(freq, 2)
ent = -ent
print('Shannon entropy (min bits per byte-character):')
print(ent)
print('')
print('Min possible file size assuming max theoretical compression efficiency:')
print('%s in bits' % (ent * fileSize))
print('%s in bytes' % ((ent * fileSize) / 8))
FB36 (author) 11 years, 6 months ago  # | flag

Shannon Entropy calculation for the same input string using 2/4/8/16 bits/symbol:

# Shannon Entropy calculation for the same input string
# using 2/4/8/16 bits/symbol
# FB - 201012083
import math
import random

n = 10 # arbitrary
strLen = 16 * n  # total number of random bits to generate
print('Number of bits in the input string: %d' % strLen)
print('')
# generate strLen random bits as the input string
bits = [random.randint(0, 1) for i in range(strLen)]
print('Input string of random bits:')
# str.join replaces string.join, which was removed in Python 3
print(' '.join(str(bit) for bit in bits))
print('')

for i in range(4):
    bitLen = 2 ** (i + 1)  # 2, 4, 8, 16 bits per symbol
    print('Using %d bits/symbol:' % bitLen)
    # frequency table has one slot per possible bitLen-bit symbol value
    freqLen = 2 ** bitLen
    freqList = [0] * freqLen
    # floor division: on Python 3 '/' would yield a float and break range()
    numSym = strLen // bitLen
    print('Number of symbols in the input string: %d' % numSym)
    for j in range(numSym):
        # j-th symbol = the next bitLen bits read as a binary number
        sym = ''.join(str(bits[bitLen * j + k]) for k in range(bitLen))
        freqList[int(sym, 2)] += 1
    print('')
    # convert counts to relative frequencies (probabilities)
    for m in range(freqLen):
        freqList[m] = float(freqList[m]) / numSym

    # Shannon Entropy: H = -sum(p * log2(p)) over non-zero probabilities
    ent = 0.0
    for freq in freqList:
        if freq > 0:
            ent = ent + freq * math.log(freq, 2)
    ent = -ent
    print('Shannon Entropy (min number of bits needed to encode each symbol):')
    print(ent)
    print('')
    print('')
FB36 (author) 11 years, 6 months ago  # | flag

If these lines added to the end of above code then it would also calculate min possible string size after max theoretical data compression:

    # Fragment meant to be appended inside the per-bitLen loop of the script
    # above: it relies on 'ent' and 'numSym' already being defined there.
    print 'Min possible string size after max theoretical data compression:'
    print ent * numSym  # entropy (bits/symbol) * symbol count -> size in BITS
    print
    print
FB36 (author) 11 years, 6 months ago  # | flag

Min possible string size is in bits.

FB36 (author) 11 years, 6 months ago  # | flag

As the above code would show, max possible compressed size of a data string depends on bits/symbol value chosen. So if a data compression algorithm, that allows for any bits/symbol value to be used, existed then Shannon entropy values for different bits/symbol could be used to choose bits/symbol value that produces the smallest compressed file size. (But of course this would also increase compression time a lot!)

FB36 (author) 11 years, 6 months ago  # | flag

Shannon Entropy of an image (calculated for red, green, blue separately):

# Shannon Entropy of an Image
# (min number of bits required to encode each color)
# FB - 201012131
import math
from PIL import Image

imageFile = 'test.jpg'
im = Image.open(imageFile)
# for an RGB image histogram() returns 768 counts:
# 256 bins for red, then 256 for green, then 256 for blue
rgbHistogram = im.histogram()
print('Shannon Entropy for Red, Green, Blue:')  # fixed output typo: was 'Snannon'
for rgb in range(3):
    # total pixel count for this channel (sum of its 256 histogram bins)
    totalPixels = sum(rgbHistogram[rgb * 256 : (rgb + 1) * 256])
    # Shannon entropy of the channel: H = -sum(p * log2(p)), skipping empty bins
    ent = 0.0
    for col in range(rgb * 256, (rgb + 1) * 256):
        freq = float(rgbHistogram[col]) / totalPixels
        if freq > 0:
            ent = ent + freq * math.log(freq, 2)
    ent = -ent
    print(ent)
FB36 (author) 11 years, 6 months ago  # | flag

Shannon Entropy of a file (this code is more efficient than the previous one):

# Shannon Entropy of a file
# FB - 201012153
import sys
import math

if len(sys.argv) != 2:
    print('Usage: file_entropy.py [path]filename')
    sys.exit()

# read the whole file as raw bytes; bytearray iterates as ints on Python 2 and 3
with open(sys.argv[1], 'rb') as inFile:
    byteArr = bytearray(inFile.read())
fileSize = len(byteArr)
print('File size in bytes: %d' % fileSize)
print('')

# count occurrences of each of the 256 possible byte values in one pass
freqList = [0] * 256
for b in byteArr:
    freqList[b] += 1

# Shannon entropy: H = -sum(p * log2(p)) over non-zero byte probabilities
# (loop variable renamed: the original reused 'f', shadowing the file handle)
ent = 0.0
for count in freqList:
    if count > 0:
        freq = float(count) / fileSize
        ent = ent + freq * math.log(freq, 2)
ent = -ent
print('Shannon entropy (min bits per byte-character): %s' % ent)
print('')
print('Min possible file size assuming max theoretical compression efficiency:')
print('%s bytes' % ((ent * fileSize) / 8))
Sam Khan 9 years, 4 months ago  # | flag

thank you :)

Brandon Lashmet 9 years ago  # | flag

I'm trying to run the code in post "Shannon Entropy of a file (this code is more efficient then the previous one)", however, it gives a syntax error in Python.

What modifications do I need to make for it to scan a specific file? Sorry for the noobish question. Thanks! :)

FB36 (author) 9 years ago  # | flag

It is designed to be run from (Windows/Linux) command prompt.

First install latest Python 2.x version.

Save that code as file_entropy.py file.

Add path of Python directory to your Windows/Linux path.

Open command prompt.

Type:

python [Path to file_entropy.py]file_entropy.py [Path to the file to be analyzed][Name of the file to be analyzed]

Or

file_entropy.py [Path to the file to be analyzed][Name of the file to be analyzed]

I just run again and there is no syntax error.

(If you still get it please post me the exact error message you see.)

FB36 (author) 6 years, 10 months ago  # | flag
# Shannon Entropy of a file using 1/2/4/8/16 bits/symbol
# FB - 20150809
import math
import sys

if len(sys.argv) != 2:
    print('Usage: file_entropy_N.py [path]filename')
    sys.exit()
inputFileName = sys.argv[1]
# read the whole file as raw bytes; bytearray iterates as ints on Python 2 and 3
with open(inputFileName, 'rb') as inFile:
    byteArr = bytearray(inFile.read())
fileSize = len(byteArr)
print('File size in bytes: %d' % fileSize)
print('')
strLen = fileSize * 8 # file size in bits
for i in range(5):
    bitLen = 2 ** i  # 1, 2, 4, 8, 16 bits per symbol
    print('Using %d bits/symbol:' % bitLen)
    # frequency table has one slot per possible bitLen-bit symbol value
    freqLen = 2 ** bitLen
    freqList = [0] * freqLen
    # floor division: on Python 3 '/' would yield a float and break range()
    numSym = strLen // bitLen
    print('Number of symbols in the input string: %d' % numSym)
    for j in range(numSym):
        # assemble the j-th symbol MSB-first directly as an integer
        # (the original built a '0'/'1' string and re-parsed it with int(sym, 2))
        symVal = 0
        for k in range(bitLen):
            bitInd = bitLen * j + k
            byteInd = bitInd // 8  # '//' keeps this an int on Python 3
            if byteInd < fileSize:
                byteVal = byteArr[byteInd]
            else:
                byteVal = 0  # pad past end of file with zero bits
            bitInByteInd = 7 - bitInd % 8
            # shift-and-mask extracts the bit; the original's integer
            # division by a power of two breaks under Python 3 true division
            symVal = (symVal << 1) | ((byteVal >> bitInByteInd) & 1)
        freqList[symVal] += 1
    # convert counts to relative frequencies (probabilities)
    for m in range(freqLen):
        freqList[m] = float(freqList[m]) / numSym

    # Shannon Entropy: H = -sum(p * log2(p)); ctr counts distinct symbols seen
    ent = 0.0
    ctr = 0
    for freq in freqList:
        if freq > 0:
            ent = ent + freq * math.log(freq, 2)
            ctr += 1
    ent = -ent
    print('Number of unique symbols in the input string: %d' % ctr)
    print('Shannon Entropy (min number of bits needed to encode each symbol):')
    print(ent)
    print('Min possible file size assuming max theoretical data compression:')
    print('%s bytes' % ((ent * numSym) / 8))
    print('')
Created by FB36 on Mon, 29 Nov 2010 (MIT)
Python recipes (4591)
FB36's recipes (148)

Required Modules

  • (none specified)

Other Information and Tasks