Python regular expressions are powerful and efficient, and they can be applied to recognizing many different kinds of sequences. This recipe shows how to match sequences over a generic symbol set using the power of regular expressions. The code maps every entity (symbol) to a single character. The same mapping is used both to encode the sequence and to compile the regular expression. When the symbol set is small, 8-bit strings can be used efficiently instead of full Unicode strings.
import re
class reop(object):
    """Wrapper that marks a raw regular-expression fragment.

    Instances are distinguished from ordinary symbols with ``isinstance``;
    the wrapped regex source text is stored unchanged in ``value``.
    """

    def __init__(self, x):
        # x: the regular-expression source text to carry, e.g. ".*?"
        self.value = x

    def __repr__(self):
        # Readable form for debugging generic expressions that mix
        # symbols and operators.
        return "reop(%r)" % (self.value,)
def buildmap(inx):
    """Build the symbol -> integer code map for the symbols in *inx*.

    Codes are consecutive integers starting at 0, assigned in order of first
    appearance; repeated symbols keep their first code.

    The result is a ``defaultdict``: looking up a symbol that is not yet
    present assigns it the next free code and stores it in the map.
    """
    import itertools
    from collections import defaultdict

    counter = itertools.count()
    # next(counter) works on Python 2.6+ and Python 3, whereas the bound
    # method counter.next only exists on Python 2.
    d = defaultdict(lambda: next(counter))
    for x in inx:
        d[x]  # the lookup alone triggers the default factory for new symbols
    return d
def buildimap(inmap):
    """Build the inverse map (code -> symbol) of *inmap*.

    Returns a plain ``dict`` snapshot: entries added to *inmap* afterwards
    are not reflected in the result.
    """
    # .items() is available on both Python 2 and 3 (iteritems() is 2-only).
    return {code: symbol for symbol, code in inmap.items()}
def buildseq(inmap, inseq, xdef=None):
    """Encode the symbol sequence *inseq* as a string, one char per symbol.

    inmap -- mapping from symbol to integer code.
    inseq -- iterable of symbols to encode (any iterable, not only lists).
    xdef  -- optional default symbol; when given, symbols missing from
             *inmap* are encoded with the code of *xdef* and are not added
             to the map. When omitted, every symbol is looked up with
             ``inmap[symbol]`` (a plain dict raises KeyError for unknown
             symbols).

    Returns a narrow string built with ``chr`` when the symbol set is small
    (fewer than 254 codes), otherwise a unicode string.
    """
    try:
        wide_chr = unichr  # Python 2
    except NameError:
        wide_chr = chr  # Python 3: chr already covers all of Unicode

    # Materialise once so generators are accepted and len() is available.
    symbols = list(inseq)

    if xdef is not None:
        fallback = inmap[xdef]
        narrow = len(inmap) < 254
        codes = [inmap.get(symbol, fallback) for symbol in symbols]
    else:
        # The size check is made before the lookups and counts the sequence
        # length too -- presumably because a defaultdict map may grow by up
        # to one entry per symbol while encoding.
        narrow = len(inmap) + len(symbols) < 254
        codes = [inmap[symbol] for symbol in symbols]

    if narrow:
        return "".join([chr(code) for code in codes])
    return u"".join([wide_chr(code) for code in codes])
def compile(inmap, gregexp):
    """Compile a generic regular expression over the symbols of *inmap*.

    inmap   -- mapping from symbol to integer code.
    gregexp -- iterable whose items are either ``reop`` instances (their
               ``value`` is inserted into the pattern verbatim) or symbols
               (looked up with ``inmap[symbol]`` and matched literally).

    Returns the compiled pattern object from ``re.compile``.

    Note: this function shadows the ``compile`` builtin inside this module;
    the name is kept because callers use it.
    """
    try:
        wide_chr = unichr  # Python 2
    except NameError:
        wide_chr = chr  # Python 3

    pieces = []
    for item in gregexp:
        # Explicit branch instead of "a and b or c": that idiom falls through
        # to the symbol lookup when a reop carries an empty value.
        if isinstance(item, reop):
            pieces.append(item.value)
        else:
            # re.escape, not a blind backslash prefix: a backslash before an
            # alphanumeric char forms a special sequence (\1, \d, \A, \b, ...)
            # instead of a literal match.
            pieces.append(re.escape(wide_chr(inmap[item])))
    return re.compile(u"".join(pieces))
def unmap(inmapr, encoded):
    """Decode *encoded* back into a list of symbols.

    inmapr  -- mapping from integer code to symbol.
    encoded -- iterable of single characters; each one is converted with
               ``ord`` and looked up in *inmapr*.
    """
    symbols = []
    for char in encoded:
        code = ord(char)
        symbols.append(inmapr[code])
    return symbols
if __name__ == "__main__":
    # Demo: encode a sentence word by word and find the stretches that go
    # from "look" to the nearest following "world".
    x = ["hello", "world", "view", "around", "*"]
    map1 = buildmap(x)
    rex = compile(map1, ("look", reop(".*?"), "world"))
    es = buildseq(map1, "when I look the world what can I look at you in the world".split(" "))
    # Build the inverse map only now: map1 is a defaultdict, so compile() and
    # buildseq() above added every new word they looked up.
    map1i = buildimap(map1)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("%s %s %s %s" % ("encoded is ", es, len(es), type(es)))
    # Decode with the inverse map; passing map1 itself would look up integer
    # keys in the forward defaultdict, printing junk and adding entries to it.
    print(unmap(map1i, es))
    print("go!")
    for m in rex.findall(es):
        print(unmap(map1i, m))
|