import re class reop: """supporting class for representing re operators""" def __init__(self,x): self.value = x def buildmap(inx): """builds the map from the symbol set above, every symbol as a unicode char""" import itertools from collections import defaultdict d = defaultdict(itertools.count().next) for x in inx: d[x] return d def buildimap(inmap): """builds the inverse map""" return dict([(y,x) for x,y in inmap.iteritems()]) def buildseq(inmap,inseq,xdef=None): """given a sequence and the mapping returns the encoding""" if xdef is not None: xdef = inmap[xdef] if len(inmap) < 254: r = "".join([chr(inmap.get(x,xdef)) for x in inseq]) else: r = u"".join([unichr(inmap.get(x,xdef)) for x in inseq]) else: if len(inmap)+len(inseq) < 254: r = "".join([chr(inmap[x]) for x in inseq]) else: r = u"".join([unichr(inmap[x]) for x in inseq]) return r def compile(inmap,gregexp): """given a mapping dictionary and a generic regular expression returns it compiled""" return re.compile(u"".join([isinstance(x,reop) and x.value or u"\\"+unichr(inmap[x]) for x in gregexp])) def unmap(inmapr,encoded): return [inmapr[ord(x)] for x in encoded] if __name__ == "__main__": x = ["hello","world","view","around","*"] map1 = buildmap(x) rex = compile(map1,("look",reop(".*?"),"world")) es = buildseq(map1,"when I look the world what can I look at you in the world".split(" ")) print "encoded is ",es,len(es),type(es) print unmap(map1,es) print "go!" map1i = buildimap(map1) for m in rex.findall(es): print unmap(map1i,m)