import re

def tokeniser( text, tokenpat=None, blockchar='()[]{}' ):
	'Lightweight text tokeniser for simple structured text'
	defpat = r'''
		(-?\d+\.?\d*)|  # find -nn.nnn or -nnn or nn numbers
		(\w+)|          # look for words (identifiers) next
		(".*?")|        # look for double-quoted strings
		('.*?')|        # look for single-quoted strings
		([ \t]+)|       # gather white space (but not new lines)
		(\n)|           # check for a new line character
		(.)             # capture any other text as single characters'''
	openchar, closechar = blockchar[0::2], blockchar[1::2]  # '([{' and ')]}'
	blockpair = dict( zip( closechar, openchar ) )  # map each closer to its opener
	stack = []
	block = []
	# note re.X: a caller-supplied tokenpat is also parsed in verbose mode
	synpat = re.compile( tokenpat or defpat, re.M | re.S | re.X )
	for token in synpat.split( text ):
		if token:  # drop '' and the None entries from non-matching groups
			if token in openchar:
				block.append( [] )  # start a nested sub-list for this block
				stack.append( block )
				block = block[-1]
			block.append( token )
			if token in closechar:
				assert block[0] == blockpair[ token ], 'Block end mismatch'
				assert stack, 'Block start mismatch'
				block = stack.pop()  # pop back out to the enclosing block
	assert stack == [], 'Block not closed'
	return block
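
# Shape of the result, traced by hand for a tiny input (our own example,
# not part of the recipe): tokeniser( 'a(b)' ) returns
# ['a', ['(', 'b', ')']] -- each bracketed region becomes a nested
# sub-list carrying its own open and close characters, and unbalanced
# input trips one of the asserts above.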

def showtokens( tokens, indent=0 ):
	for token in tokens:
		if isinstance( token, list ):
			showtokens( token, indent+1 )
		else:
			print '%sToken: %s' % ('    '*indent, repr( token ))
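
# For the 'a(b)' example above, this prints one repr-quoted token per
# line, indented four spaces per nesting level:
#   Token: 'a'
#       Token: '('
#       Token: 'b'
#       Token: ')'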

if __name__ == '__main__':
	example = '''
for x in xseq[2:]:
	print fn( x*-5.5, "it\'s big", "", {'g':[0]} )
end
	'''.strip()
	result = tokeniser( example )
	showtokens( result )
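	# Sketch of the optional tokenpat hook (wordpat is our own name, not
	# part of the recipe): a coarser pattern keeping only words,
	# double-quoted strings and single characters. Because the pattern is
	# compiled with re.X, custom patterns must avoid unescaped literal
	# spaces and '#' characters.
	wordpat = r'''(\w+)|(".*?")|(.)'''
	showtokens( tokeniser( 'f("x y", g(1))', wordpat ) )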

Diff to Previous Revision

--- revision 1 2011-05-17 15:11:52
+++ revision 2 2011-05-18 13:04:01
@@ -2,12 +2,19 @@
 
 def tokeniser( text, tokenpat=None, blockchar='()[]{}' ):
 	'Lightweight text tokeniser for simple structured text'
-	defpat = r'''(-?\d+\.?\d*)|(\w+)|(".*?")|('.*?')|([ \t]+)|(\n)|(.)'''
+	defpat = r'''
+		(-?\d+\.?\d*)|  # find -nn.nnn or -nnn or nn numbers
+		(\w+)|          # look for words (identifiers) next
+		(".*?")|        # look for double-quoted strings
+		('.*?')|        # look for single quoted strings
+		([ \t]+)|       # gather white space (but not new lines)
+		(\n)|           # check for a new line character
+		(.)             # capture any other text as single characters'''
 	openchar, closechar = blockchar[0::2], blockchar[1::2]
 	blockpair = dict( zip( closechar, openchar ) )
 	stack = []
 	block = []
-	synpat = re.compile( tokenpat or defpat, re.M )
+	synpat = re.compile( tokenpat or defpat, re.M + re.S + re.X )
 	for token in synpat.split( text ):
 		if token:
 			if token in openchar:
@@ -22,7 +29,18 @@
 	assert stack == [], 'Block not closed'
 	return block
 
+def showtokens( tokens, indent=0 ):
+	for token in tokens:
+		if type( token ) == list:
+			showtokens( token, indent+1 )
+		else:
+			print '%sToken: %s' % ('    '*indent, `token`)
+
 if __name__ == '__main__':
-	from pprint import pprint
-	code='''for x in xseq[2:]:\n prt fn(x*-5.5,"it\'s","",{'g':[0]})\nend\n'''
-	pprint( tokeniser( code ) )
+	example = '''
+for x in xseq[2:]:
+	print fn( x*-5.5, "it\'s big", "", {'g':[0]} )
+end
+	'''.strip()
+	result = tokeniser( example )
+	showtokens( result )
