import re
class recollector:
    def __init__(self):
        self.res = {}
    def add(self, name, reg):
        re.compile(reg)  # check that it is valid
        self.res[name] = reg % self.res
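# Each pattern below is registered under a name and can splice in
# previously registered patterns via Python's "%(name)s" interpolation:
# once "UntilHyphen" is added, for instance, "%(UntilHyphen)s" in a later
# pattern expands to "[^-]*-".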
collector = recollector()
a = collector.add
a("TextSE" , "[^<]+")
a("UntilHyphen" , "[^-]*-")
a("Until2Hyphens" , "%(UntilHyphen)s(?:[^-]%(UntilHyphen)s)*-")
a("CommentCE" , "%(Until2Hyphens)s>?")
a("UntilRSBs" , "[^\\]]*](?:[^\\]]+])*]+")
a("CDATA_CE" , "%(UntilRSBs)s(?:[^\\]>]%(UntilRSBs)s)*>" )
a("S" , "[ \\n\\t\\r]+")
a("NameStrt" , "[A-Za-z_:]|[^\\x00-\\x7F]")
a("NameChar" , "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]")
a("Name" , "(?:%(NameStrt)s)(?:%(NameChar)s)*")
a("QuoteSE" , "\"[^\"]*\"|'[^']*'")
a("DT_IdentSE" , "%(S)s%(Name)s(?:%(S)s(?:%(Name)s|%(QuoteSE)s))*" )
a("MarkupDeclCE" , "(?:[^\\]\"'><]+|%(QuoteSE)s)*>" )
a("S1" , "[\\n\\r\\t ]")
a("UntilQMs" , "[^?]*\\?+")
a("PI_Tail" , "\\?>|%(S1)s%(UntilQMs)s(?:[^>?]%(UntilQMs)s)*>" )
a("DT_ItemSE" ,
"<(?:!(?:--%(Until2Hyphens)s>|[^-]%(MarkupDeclCE)s)|\\?%(Name)s(?:%(PI_Tail)s))|%%%(Name)s;|%(S)s"
)
a("DocTypeCE" ,
"%(DT_IdentSE)s(?:%(S)s)?(?:\\[(?:%(DT_ItemSE)s)*](?:%(S)s)?)?>?" )
a("DeclCE" ,
"--(?:%(CommentCE)s)?|\\[CDATA\\[(?:%(CDATA_CE)s)?|DOCTYPE(?:%(DocTypeCE)s)?")
a("PI_CE" , "%(Name)s(?:%(PI_Tail)s)?")
a("EndTagCE" , "%(Name)s(?:%(S)s)?>?")
a("AttValSE" , "\"[^<\"]*\"|'[^<']*'")
a("ElemTagCE" ,
"%(Name)s(?:%(S)s%(Name)s(?:%(S)s)?=(?:%(S)s)?(?:%(AttValSE)s))*(?:%(S)s)?/?>?")
a("MarkupSPE" ,
"<(?:!(?:%(DeclCE)s)?|\\?(?:%(PI_CE)s)?|/(?:%(EndTagCE)s)?|(?:%(ElemTagCE)s)?)")
a("XML_SPE" , "%(TextSE)s|%(MarkupSPE)s")
a("XML_MARKUP_ONLY_SPE" , "%(MarkupSPE)s")
def lexxml(data, markuponly=0):
    if markuponly:
        reg = "XML_MARKUP_ONLY_SPE"
    else:
        reg = "XML_SPE"
    regex = re.compile(collector.res[reg])
    return regex.findall(data)
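# Illustrative use (this example is not from the original suite):
#   >>> lexxml("<a href='x'>hi</a>")
#   ["<a href='x'>", 'hi', '</a>']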
def assertlex(data, numtokens, markuponly=0):
    tokens = lexxml(data, markuponly)
    assert len(tokens) == numtokens, \
        "data = %r, expected %s tokens, got %s" % (data, numtokens, len(tokens))
    if not markuponly:
        # in full-lexing mode the tokens must reassemble the input exactly
        assert "".join(tokens) == data
    walktokens(tokens)
def walktokens(tokens):
    print   # blank line before each token dump
    for token in tokens:
        if token.startswith("<"):
            # a coarse classification of markup tokens
            if token.startswith("</"):
                print "end-tag:", token
            elif token.startswith("<!"):
                print "declaration:", token
            elif token.startswith("<?"):
                print "pi:", token
            elif token.endswith("/>"):
                print "empty-tag:", token
            elif token.endswith(">"):
                print "start-tag:", token
            else:
                print "error:", token
        else:
            print "text:", token
def testlexer():
    # this test suite could be larger!
    # (representative inputs, chosen to match the expected token counts)
    assertlex("<a>", 1)
    assertlex("<a><b/></a>", 3)
    assertlex("<a>Blah</a>", 3)
    assertlex("<a>Blah</a>", 2, markuponly=1)
    assertlex("<a><b>Blah</b>", 3, markuponly=1)
    assertlex("<a>Blah&foo;Blah</a>", 3)
    assertlex("<a>Blah&foo;Blah</a>", 2, markuponly=1)
    assertlex("<a></a>", 2)
    assertlex("<!-- comment --><a>", 2)
    assertlex("<?pi data?><a></a>", 3)
if __name__=="__main__":
    testlexer()