
The Shannon entropy of a string is the minimum average number of bits per symbol required for encoding (compressing) the string: for symbols occurring with relative frequencies p(i), it is H = -sum over i of p(i) * log2(p(i)).

Python, 40 lines
# Shannon Entropy of a string
# = minimum average number of bits per symbol
# required for encoding the string
#
# So the theoretical limit for data compression:
# Shannon Entropy of the string * string length
# FB - 201011291
import math

st = 'aabcddddefffg' # input string
# st = '00010101011110' # Shannon entropy for this would be 1 bit/symbol

print 'Input string:'
print st
print
stList = list(st)
alphabet = list(set(stList)) # list of unique symbols in the string
print 'Alphabet of symbols in the string:'
print alphabet
print
# calculate the frequency of each symbol in the string
freqList = []
for symbol in alphabet:
    ctr = 0
    for sym in stList:
        if sym == symbol:
            ctr += 1
    freqList.append(float(ctr) / len(stList))
print 'Frequencies of alphabet symbols:'
print freqList
print
# Shannon entropy
ent = 0.0
for freq in freqList:
    ent = ent + freq * math.log(freq, 2)
ent = -ent
print 'Shannon entropy:'
print ent
print 'Minimum number of bits required to encode each symbol:'
print int(math.ceil(ent))
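
For reference, the same value can be computed more compactly with collections.Counter (Python 2.7+; a sketch, not part of the recipe above, reusing the same input string st):

import math
from collections import Counter

st = 'aabcddddefffg' # same input string as the recipe
counts = Counter(st) # occurrence count of each symbol
probs = [float(c) / len(st) for c in counts.values()]
ent = -sum(p * math.log(p, 2) for p in probs)
print 'Shannon entropy:', ent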

13 comments

FB36 (author) 13 years, 5 months ago  # | flag

Since Shannon entropy gives the theoretical data compression limit, it can be used to calculate the efficiency percentage of any data compression software (for a particular input data file):

compression efficiency percentage = ((original file size) - (compressed file size)) / ((original file size) - (min file size possible)) * 100

(min file size possible) = SE * (file size)

(Actually this efficiency calculation is only approximate, because (min file size possible) excludes the size of the decompression key, which in reality is included in the compressed file.)

FB36 (author) 13 years, 5 months ago  # | flag

Actually it is:

(min file size possible) = SE * (file size) / 8

because the SE value means 'min bits per byte-character' of the file, so the product SE * (file size) is in bits and must be divided by 8 to convert to bytes.
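
For example, a quick sketch of the corrected calculation (the file size, compressed size, and SE value below are made-up numbers, just for illustration):

originalSize = 1000000 # original file size in bytes (hypothetical)
compressedSize = 600000 # compressed file size in bytes (hypothetical)
SE = 4.5 # Shannon entropy in bits per byte-character (hypothetical)
minSize = SE * originalSize / 8.0 # min file size possible, in bytes
efficiency = (originalSize - compressedSize) / (originalSize - minSize) * 100
print 'Min file size possible:', minSize, 'bytes'
print 'Compression efficiency:', efficiency, '%'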

FB36 (author) 13 years, 5 months ago  # | flag

This code can be used to calculate Shannon Entropy of file:

# file_entropy.py
#
# Shannon Entropy of a file
# = minimum average number of bits per character
# required for encoding (compressing) the file
#
# So the theoretical limit (in bytes) for data compression:
# Shannon Entropy of the file * file size (in bytes) / 8
# (Assuming the file is a string of byte-size (UTF-8?) characters
# because if not then the Shannon Entropy value would be different.)
# FB - 201011291
import sys
import math
if len(sys.argv) != 2:
    print "Usage: file_entropy.py [path]filename"
    sys.exit()

# read the whole file into a byte array
f = open(sys.argv[1], "rb")
byteArr = map(ord, f.read())
f.close()
fileSize = len(byteArr)
print 'File size in bytes:'
print fileSize
print

# calculate the frequency of each byte value in the file
freqList = []
for b in range(256):
    ctr = 0
    for byte in byteArr:
        if byte == b:
            ctr += 1
    freqList.append(float(ctr) / fileSize)
# print 'Frequencies of each byte-character:'
# print freqList
# print

# Shannon entropy
ent = 0.0
for freq in freqList:
    if freq > 0:
        ent = ent + freq * math.log(freq, 2)
ent = -ent
print 'Shannon entropy (min bits per byte-character):'
print ent
print
print 'Min possible file size assuming max theoretical compression efficiency:'
print (ent * fileSize), 'in bits'
print (ent * fileSize) / 8, 'in bytes'

FB36 (author) 13 years, 4 months ago  # | flag

Shannon Entropy calculation for the same input string using 2/4/8/16 bits/symbol:

# Shannon Entropy calculation for the same input string 
# using 2/4/8/16 bits/symbol
# FB - 201012083
import math
import random
import string

n = 10 # arbitrary
strLen = 16 * n
print 'Number of bits in the input string:', strLen
print
# generate strLen random bits as the input string
bits = [random.randint(0, 1) for i in range(strLen)]
print 'Input string of random bits:'
print string.join([str(bit) for bit in bits])
print

for i in range(4):
    bitLen = 2 ** (i + 1)
    print 'Using', bitLen, 'bits/symbol:'
    # generate frequency table for bitLen bits/symbol
    freqLen = 2 ** bitLen # size of the frequency table
    freqList = [0] * freqLen
    numSym = strLen / bitLen
    print 'Number of symbols in the input string:', numSym
    for j in range(numSym):
        sym = ''
        for k in range(bitLen):
            sym += str(bits[bitLen * j + k])
        # print 'Input symbol', j, ': ', sym
        freqList[int(sym, 2)] += 1
    print
    for m in range(freqLen):
        freqList[m] = float(freqList[m]) / numSym
    # print 'Frequency table:'
    # print freqList
    # print

    # Shannon Entropy
    ent = 0.0
    for freq in freqList:
        if freq > 0:
            ent = ent + freq * math.log(freq, 2)
    ent = -ent
    print 'Shannon Entropy (min number of bits needed to encode each symbol):'
    print ent
    print
    print
FB36 (author) 13 years, 4 months ago  # | flag

If these lines are added to the end of the above code, it will also calculate the min possible string size after max theoretical data compression:

    print 'Min possible string size after max theoretical data compression:'
    print ent * numSym
    print
    print
FB36 (author) 13 years, 4 months ago  # | flag

Min possible string size is in bits.
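
To express it in bytes instead, the same division by 8 used in the file version applies (a one-line sketch, not part of the original comment):

    print ent * numSym / 8, 'bytes'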

FB36 (author) 13 years, 4 months ago  # | flag

As the above code shows, the minimum possible compressed size of a data string (the size after maximum theoretical compression) depends on the bits/symbol value chosen. So if a data compression algorithm existed that allowed any bits/symbol value to be used, then the Shannon entropy values for different bits/symbol values could be used to choose the one that produces the smallest compressed file size. (But of course this would also increase compression time a lot!)
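
A rough sketch of that idea (not part of the original comments; the entropy_bits helper and the example bit list are made up for illustration): compute the entropy-based minimum size for each candidate bits/symbol value and keep the smallest.

import math

def entropy_bits(bits, bitLen):
    # hypothetical helper: Shannon entropy (bits/symbol) of a bit list
    # read bitLen bits at a time
    numSym = len(bits) / bitLen
    freq = {}
    for j in range(numSym):
        sym = tuple(bits[bitLen * j : bitLen * (j + 1)])
        freq[sym] = freq.get(sym, 0) + 1
    ent = 0.0
    for count in freq.values():
        p = float(count) / numSym
        ent -= p * math.log(p, 2)
    return ent, numSym

bits = [1, 0, 1, 1, 0, 0, 1, 0] * 20 # example bit list (arbitrary)
best = None
for bitLen in (1, 2, 4, 8):
    ent, numSym = entropy_bits(bits, bitLen)
    minSizeBits = ent * numSym # theoretical min compressed size in bits
    print bitLen, 'bits/symbol -> min size:', minSizeBits, 'bits'
    if best is None or minSizeBits < best[1]:
        best = (bitLen, minSizeBits)
print 'Best choice:', best[0], 'bits/symbol'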

FB36 (author) 13 years, 4 months ago  # | flag

Shannon Entropy of an image (calculated for red, green, blue separately):

# Shannon Entropy of an Image
# (min number of bits required to encode each color)
# FB - 201012131
import math
from PIL import Image
imageFile = 'test.jpg'
im = Image.open(imageFile)
rgbHistogram = im.histogram()
print 'Shannon Entropy for Red, Green, Blue:'
for rgb in range(3):
    totalPixels = sum(rgbHistogram[rgb * 256 : (rgb + 1) * 256])
    ent = 0.0
    for col in range(rgb * 256, (rgb + 1) * 256):
        freq = float(rgbHistogram[col]) / totalPixels
        if freq > 0:
            ent = ent + freq * math.log(freq, 2)
    ent = -ent
    print ent

FB36 (author) 13 years, 4 months ago  # | flag

Shannon Entropy of a file (this code is more efficient than the previous one):

# Shannon Entropy of a file
# FB - 201012153
import sys
import math
if len(sys.argv) != 2:
    print 'Usage: file_entropy.py [path]filename'
    sys.exit()
f = open(sys.argv[1], 'rb')
byteArr = bytearray(f.read())
f.close()
fileSize = len(byteArr)
print 'File size in bytes:', fileSize
print

freqList = [0] * 256
for b in byteArr:
    freqList[b] += 1

ent = 0.0
for f in freqList:
    if f > 0:
        freq = float(f) / fileSize
        ent = ent + freq * math.log(freq, 2)
ent = -ent
print 'Shannon entropy (min bits per byte-character):', ent
print
print 'Min possible file size assuming max theoretical compression efficiency:'
print (ent * fileSize) / 8, 'bytes'

Sam Khan 11 years, 2 months ago  # | flag

thank you :)

Brandon Lashmet 10 years, 10 months ago  # | flag

I'm trying to run the code in the post "Shannon Entropy of a file (this code is more efficient than the previous one)"; however, it gives a syntax error in Python.

What modifications do I need to make for it to scan a specific file? Sorry for the noobish question. Thanks! :)

FB36 (author) 10 years, 10 months ago  # | flag

It is designed to be run from the (Windows/Linux) command prompt.

First, install the latest Python 2.x version.

Save that code as file_entropy.py.

Add the path of the Python directory to your Windows/Linux PATH.

Open a command prompt.

Type:

python [Path to file_entropy.py]file_entropy.py [Path to your file to be analyzed][Name of the file to be analyzed]

Or

[Path to file_entropy.py]file_entropy.py [Path to your file to be analyzed][Name of the file to be analyzed]

I just ran it again and there is no syntax error.

(If you still get it, please post the exact error message you see.)
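
For example, assuming the script was saved as C:\scripts\file_entropy.py and the file to analyze is C:\data\test.bin (both paths are hypothetical, just for illustration):

python C:\scripts\file_entropy.py C:\data\test.bin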

FB36 (author) 8 years, 8 months ago  # | flag
# Shannon Entropy of a file using 1/2/4/8/16 bits/symbol
# FB - 20150809
import math
import sys
import os

if len(sys.argv) != 2:
    print 'Usage: file_entropy_N.py [path]filename'
    sys.exit()
inputFileName = sys.argv[1]
fileSize = os.path.getsize(inputFileName)
f = open(inputFileName, 'rb')
byteArr = bytearray(f.read(fileSize))
f.close()
fileSize = len(byteArr)
print 'File size in bytes:', fileSize
print
strLen = fileSize * 8 # file size in bits
for i in range(5):
    bitLen = 2 ** i
    print 'Using', bitLen, 'bits/symbol:'
    # generate frequency table for bitLen bits/symbol
    freqLen = 2 ** bitLen # size of the frequency table
    freqList = [0] * freqLen
    numSym = strLen / bitLen
    print 'Number of symbols in the input string:', numSym
    for j in range(numSym):
        sym = ''
        for k in range(bitLen):
            bitInd = bitLen * j + k
            byteInd = int(bitInd / 8)
            if byteInd < fileSize:
                byteVal = byteArr[byteInd]
            else:
                byteVal = 0
            bitInByteInd = 7 - bitInd % 8
            bitVal = int(byteVal / 2 ** bitInByteInd) % 2
            sym += str(bitVal)
        freqList[int(sym, 2)] += 1
    for m in range(freqLen):
        freqList[m] = float(freqList[m]) / numSym
    # print 'Frequency table:'
    # print freqList
    # print

    ent = 0.0
    ctr = 0
    for freq in freqList:
        if freq > 0:
            ent = ent + freq * math.log(freq, 2)
            ctr += 1
    ent = -ent
    print 'Number of unique symbols in the input string:', ctr
    print 'Shannon Entropy (min number of bits needed to encode each symbol):'
    print ent
    print 'Min possible file size assuming max theoretical data compression:'
    print (ent * numSym) / 8, 'bytes'
    print

Created by FB36 on Mon, 29 Nov 2010 (MIT)