import re
def tokeniser( text, tokenpat=None, blockchar='()[]{}' ):
    'Lightweight text tokeniser for simple structured text'
    defpat = r'''
        (-?\d+\.?\d*)|    # find -nn.nnn or -nnn or nn numbers
        (\w+)|            # look for words (identifiers) next
        (".*?")|          # look for double-quoted strings
        ('.*?')|          # look for single-quoted strings
        ([ \t]+)|         # gather whitespace (but not newlines)
        (\n)|             # check for a newline character
        (.)               # capture any other text as single characters'''
    openchar, closechar = blockchar[0::2], blockchar[1::2]
    blockpair = dict( zip( closechar, openchar ) )
    stack = []
    block = []
    synpat = re.compile( tokenpat or defpat, re.M | re.S | re.X )
    # re.split on a pattern of capturing groups yields every token; the
    # Nones from non-matching groups and the empty strings between
    # adjacent matches are skipped by the 'if token' test below
    for token in synpat.split( text ):
        if token:
            if token in openchar:
                # start a new nested block and descend into it
                block.append( [] )
                stack.append( block )
                block = block[-1]
            block.append( token )
            if token in closechar:
                assert block[0] == blockpair[ token ], 'Block end mismatch'
                assert stack, 'Block start mismatch'
                block = stack.pop()
    assert stack == [], 'Block not closed'
    return block
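
# A quick sketch of the result's shape (illustrative input, not part of the
# original recipe): each opening bracket starts a fresh sub-list, and the
# brackets are kept inside the sub-list they delimit, so
#     tokeniser( 'a(b[c])' )
# returns
#     ['a', ['(', 'b', ['[', 'c', ']'], ')']]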

def showtokens( tokens, indent=0 ):
    'Pretty-print a (possibly nested) token list, one token per line'
    for token in tokens:
        if isinstance( token, list ):
            showtokens( token, indent+1 )
        else:
            print '%sToken: %s' % (' '*indent, repr( token ))
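
# For example (a hypothetical call, not in the original recipe),
# showtokens( ['a', ['(', 'b', ')']] ) prints one token per line,
# indenting nested blocks by one space per level:
#     Token: 'a'
#      Token: '('
#      Token: 'b'
#      Token: ')'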

if __name__ == '__main__':
    example = '''
for x in xseq[2:]:
    print fn( x*-5.5, "it\'s big", "", {'g':[0]} )
end
'''.strip()
    result = tokeniser( example )
    showtokens( result )
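
    # A further sketch, assuming only what the code above defines: blockchar
    # takes pairs laid out as open,close,open,close..., so any bracket pair,
    # e.g. '<>', can be tracked as a block (this demo input is hypothetical)
    assert tokeniser( 'f<x>', blockchar='<>' ) == ['f', ['<', 'x', '>']]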