Welcome, guest | Sign In | My Account | Store | Cart
import re

class recollector:
    """Registry of named regular-expression fragments built from templates.

    Each fragment is registered through ``add`` as a ``%``-format template
    that may refer to previously registered fragments with ``%(name)s``
    placeholders.  The fully expanded pattern text is stored in ``self.res``
    under the fragment's name.
    """

    def __init__(self):
        # Maps fragment name -> fully expanded regular-expression source.
        self.res = {}

    def add(self, name, reg):
        """Expand the template *reg* and store the result under *name*.

        ``%(other)s`` placeholders in *reg* are substituted with the
        fragments already held in ``self.res``; a literal percent sign must
        be written as ``%%``.  Nothing is stored when expansion or
        validation fails.

        Raises:
            KeyError: if *reg* refers to a fragment that is not registered.
            re.error: if the expanded pattern is not a valid regular
                expression.
        """
        expanded = reg % self.res
        # Validate the expanded pattern rather than the raw template: the
        # placeholders are not regex syntax, so only the substituted text
        # tells whether the fragment that will actually be used compiles.
        re.compile(expanded)
        self.res[name] = expanded
       
# Shared registry holding every pattern fragment used by the lexer, plus a
# short alias for its registration method.
collector = recollector()
a = collector.add

# (name, template) pairs.  Order matters: a template can only reference
# fragments that were registered before it, because placeholders are
# substituted at registration time.
_FRAGMENTS = (
    ("TextSE", "[^<]+"),
    ("UntilHyphen", "[^-]*-"),
    ("Until2Hyphens", "%(UntilHyphen)s(?:[^-]%(UntilHyphen)s)*-"),
    ("CommentCE", "%(Until2Hyphens)s>?"),
    ("UntilRSBs", "[^\\]]*](?:[^\\]]+])*]+"),
    ("CDATA_CE", "%(UntilRSBs)s(?:[^\\]>]%(UntilRSBs)s)*>"),
    ("S", "[ \\n\\t\\r]+"),
    ("NameStrt", "[A-Za-z_:]|[^\\x00-\\x7F]"),
    ("NameChar", "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"),
    ("Name", "(?:%(NameStrt)s)(?:%(NameChar)s)*"),
    ("QuoteSE", "\"[^\"]*\"|'[^']*'"),
    ("DT_IdentSE", "%(S)s%(Name)s(?:%(S)s(?:%(Name)s|%(QuoteSE)s))*"),
    ("MarkupDeclCE", "(?:[^\\]\"'><]+|%(QuoteSE)s)*>"),
    ("S1", "[\\n\\r\\t ]"),
    ("UntilQMs", "[^?]*\\?+"),
    ("PI_Tail", "\\?>|%(S1)s%(UntilQMs)s(?:[^>?]%(UntilQMs)s)*>"),
    ("DT_ItemSE",
     "<(?:!(?:--%(Until2Hyphens)s>|[^-]%(MarkupDeclCE)s)|\\?%(Name)s(?:%(PI_Tail)s))|%%%(Name)s;|%(S)s"),
    ("DocTypeCE",
     "%(DT_IdentSE)s(?:%(S)s)?(?:\\[(?:%(DT_ItemSE)s)*](?:%(S)s)?)?>?"),
    ("DeclCE",
     "--(?:%(CommentCE)s)?|\\[CDATA\\[(?:%(CDATA_CE)s)?|DOCTYPE(?:%(DocTypeCE)s)?"),
    ("PI_CE", "%(Name)s(?:%(PI_Tail)s)?"),
    ("EndTagCE", "%(Name)s(?:%(S)s)?>?"),
    ("AttValSE", "\"[^<\"]*\"|'[^<']*'"),
    ("ElemTagCE",
     "%(Name)s(?:%(S)s%(Name)s(?:%(S)s)?=(?:%(S)s)?(?:%(AttValSE)s))*(?:%(S)s)?/?>?"),
    ("MarkupSPE",
     "<(?:!(?:%(DeclCE)s)?|\\?(?:%(PI_CE)s)?|/(?:%(EndTagCE)s)?|(?:%(ElemTagCE)s)?)"),
    # The two top-level patterns: text-or-markup, and markup alone.
    ("XML_SPE", "%(TextSE)s|%(MarkupSPE)s"),
    ("XML_MARKUP_ONLY_SPE", "%(MarkupSPE)s"),
)

for _fragment_name, _fragment_template in _FRAGMENTS:
    a(_fragment_name, _fragment_template)


def lexxml(data, markuponly=0):
    """Return the list of tokens found in *data*.

    When *markuponly* is true the "XML_MARKUP_ONLY_SPE" pattern from the
    module-level ``collector`` is applied, otherwise "XML_SPE".  The result
    is whatever ``findall`` produces for that pattern over *data*.
    """
    pattern_name = "XML_MARKUP_ONLY_SPE" if markuponly else "XML_SPE"
    return re.findall(collector.res[pattern_name], data)

def assertlex(data, numtokens, markuponly=0):
    """Check that lexing *data* yields *numtokens* tokens, then print them.

    *data* is lexed with ``lexxml`` using the given *markuponly* flag.  The
    token count must equal *numtokens*, and, unless *markuponly* is true,
    the tokens joined together must reproduce *data* exactly.  On success
    the tokens are handed to ``walktokens`` for printing.

    Raises:
        AssertionError: if the token count differs from *numtokens*, or if
            the concatenated tokens do not equal *data* in full-lex mode.
    """
    tokens = lexxml(data, markuponly)
    # Compare against the tokens that were actually produced in the
    # requested mode; re-lexing in the default mode could let a wrong
    # expected count slip through for markup-only checks.
    if len(tokens) != numtokens:
        raise AssertionError(
            "data = '%s', numtokens = '%s', got %d tokens"
            % (data, numtokens, len(tokens)))
    # Text runs are dropped in markup-only mode, so the round-trip check
    # only applies to a full lex.
    if not markuponly and "".join(tokens) != data:
        raise AssertionError(
            "tokens do not add up to data = '%s'" % (data,))
    walktokens(tokens)

def _token_kind(token):
    """Return the label that walktokens prints in front of *token*."""
    if not token.startswith("<"):
        return "text"
    if token.startswith("<!"):
        return "declaration"
    if token.startswith("<?"):
        # Only the target "xml" itself marks the XML declaration; a target
        # that merely begins with those letters (e.g. "xml-stylesheet") is
        # an ordinary processing instruction.  The slice is empty when the
        # token ends right after "<?xml".
        if token.startswith("<?xml") and token[5:6] in ("", " ", "\t", "\r", "\n", "?"):
            return "xml declaration"
        return "processing instruction"
    if token.startswith("</"):
        return "end-tag"
    if token.endswith("/>"):
        return "empty-tag"
    if token.endswith(">"):
        return "start-tag"
    # Starts with "<" but is not closed by ">".
    return "error"


def walktokens(tokens):
    """Print one line per token, prefixed with a label for its kind.

    A blank line is printed first.  Each following line has the form
    ``"<label>: <token>"`` where the label is one of "declaration",
    "xml declaration", "processing instruction", "end-tag", "empty-tag",
    "start-tag", "error" or "text".

    The print calls pass a single pre-formatted string so that the output
    is the same whether ``print`` is a statement or a function.
    """
    print("")
    for token in tokens:
        print("%s: %s" % (_token_kind(token), token))

def testlexer():
    """Run every sample document below through assertlex, in order."""
    # this test suite could be larger!
    # Each entry: (document, expected token count, markuponly flag).
    cases = (
        ("<abc/>", 1, 0),
        ("<abc><def/></abc>", 3, 0),
        ("<abc>Blah</abc>", 3, 0),
        ("<abc>Blah</abc>", 2, 1),
        ("<?xml version='1.0'?><abc>Blah</abc>", 3, 1),
        ("<abc>Blah&foo;Blah</abc>", 3, 0),
        ("<abc>Blah&foo;Blah</abc>", 2, 1),
        ("<abc><abc>", 2, 0),
        ("</abc></abc>", 2, 0),
        ("<abc></def></abc>", 3, 0),
    )
    for document, expected_count, markup_flag in cases:
        assertlex(document, expected_count, markuponly=markup_flag)

# Run the built-in checks when this file is executed as a script.
if __name__ == "__main__":
    testlexer()

History

  • revision 2 (22 years ago)
  • previous revisions are not available