import string
#Translate in python has 2 pieces, a translation table and the translate call.
#The translation table is a list of 256 characters. Changing the order of the #characters is used for mapping
norm = string.maketrans('', '') #builds list of all characters
print len(norm) #256 characters
print string.maketrans('', '')[100] #is the letter d
print string.maketrans('', '')[101] #is the letter e
print string.maketrans('d','e')[100] #is now also the letter e
#The second piece of translate, is the translate function itself.
#The translate function has 3 parts:
#1)string to translate
#2)translation table -- always required
#3)deletion list
#Let's start simple and build
#use translate to get groups of characters
#This can be done because translate's 3rd arg is to delete characters
#build list of all characters
norm = string.maketrans('', '')
#delete letters
non_letters = string.translate(norm, norm, string.letters)
#then take the list of non_letters and remove digits
non_alnum = string.translate(non_letters, all_chars, string.digits)
#You'll notice the length shrinks appropriately as we delete
print len(all_chars),'\t256-(26*2 letters)=',len(non_letters),'\t204-10 digits=',len(non_alnum)
#Norm is a handy list to have around if all you are going to do is delete
#characters. It would be nice if translate assumed Norm if the translation table arg was null.
#To translate all non-text to a '#', you have to have a one to one mapping for #each character in translate.
#Thus we make use of the python * operator to make a string of '#'
#of the appropriate length
trans_nontext=string.maketrans(non_alnum,'#'*len(non_alnum))
#A full program to examine strings in a binary file for Regents
# would look like this. We use regular expressions to convert all groups
# of '#' to a single '#'
import string,re
norm = string.maketrans('', '') #builds list of all characters
non_alnum = string.translate(norm, norm, string.letters+string.digits)
#now examine the binary file. If Regents is in it. It contains the copyright
ftp_file=open('f:/tmp/ftp.exe','rb').read()
trans_nontext=string.maketrans(non_alnum,'#'*len(non_alnum))
cleaned=string.translate(ftp_file, trans_nontext)
for i in re.sub('#+','#',cleaned).split('#'):
if i.find('Regents')!=-1:
print 'found it!',i
break
if i>5:
print i