Recipe 101521 revision 2 « ActiveState Code

import string

#Translate in Python has 2 pieces: a translation table and the translate call.
#The translation table is a string of 256 characters. Changing the order of the
#characters in that string is what defines the mapping.

norm = string.maketrans('', '') #builds list of all characters
print len(norm) #256 characters

print string.maketrans('', '')[100] #is the letter d
print string.maketrans('', '')[101] #is the letter e
print string.maketrans('d','e')[100] #is now  also the letter e

#The second piece of translate is the translate function itself.
#The translate function takes 3 arguments:

#1)string to translate
#2)translation table  -- always required
#3)deletion list

#Let's start simple and build up.
#Use translate to get groups of characters.
#This can be done because translate's 3rd arg is a list of characters to delete.

#build list of all characters
norm = string.maketrans('', '') 

#delete letters
non_letters = string.translate(norm, norm, string.letters) 

#then take the list of non_letters and remove digits
non_alnum = string.translate(non_letters, all_chars, string.digits) 

#You'll notice the length shrinks appropriately as we delete
print len(all_chars),'\t256-(26*2 letters)=',len(non_letters),'\t204-10 digits=',len(non_alnum)

#Norm is a handy list to have around if all you are going to do is delete 
#characters. It would be nice if translate assumed Norm if the translation table arg was null.

#To translate all non-text to a '#', you have to have a one to one mapping for #each character in translate.
#Thus we make use of the python * operator to make a string of '#'
#of the appropriate length
trans_nontext=string.maketrans(non_alnum,'#'*len(non_alnum))

#A full program to search the strings in a binary file for 'Regents'
# would look like this. We use regular expressions to convert all groups
# of '#' to a single '#'.

import string,re

norm = string.maketrans('', '') #builds list of all characters
non_alnum = string.translate(norm, norm, string.letters+string.digits) 

#now examine the binary file. If Regents is in it. It contains the copyright
ftp_file=open('f:/tmp/ftp.exe','rb').read()

trans_nontext=string.maketrans(non_alnum,'#'*len(non_alnum))
cleaned=string.translate(ftp_file, trans_nontext)
for i in  re.sub('#+','#',cleaned).split('#'):
    if i.find('Regents')!=-1:
        print 'found it!',i
        break
    if i>5:
        print i
Recipe 101521 revision 2

History

Accounts

Code Recipes

Feedback & Information

ActiveState