import re


def tokeniser(text, tokenpat=None, blockchar='()[]{}'):
    """Lightweight text tokeniser for simple structured text.

    Splits *text* into tokens — numbers, words, double/single-quoted
    strings, runs of spaces/tabs, newlines, and single characters — and
    nests the tokens found between paired bracket characters into
    sub-lists, so the result mirrors the text's block structure.

    Parameters:
        text: the string to tokenise.
        tokenpat: optional replacement regex; every alternative must be a
            capturing group, because ``re.split`` only returns group
            matches as tokens.
        blockchar: bracket pairs as an interleaved open/close string,
            e.g. ``'()[]{}'`` — even positions open, odd positions close.

    Returns:
        A (possibly nested) list of token strings; each bracketed region
        becomes a sub-list whose first and last elements are the brackets.

    Raises:
        ValueError: if brackets in *text* are unbalanced or mismatched.
    """
    defpat = r'''(-?\d+\.?\d*)|(\w+)|(".*?")|('.*?')|([ \t]+)|(\n)|(.)'''
    # Even indices open a block, odd indices close one; map closer -> opener.
    openchar, closechar = blockchar[0::2], blockchar[1::2]
    blockpair = dict(zip(closechar, openchar))
    stack = []   # enclosing blocks, innermost last
    block = []   # block currently being filled
    synpat = re.compile(tokenpat or defpat, re.M)
    # re.split with capturing groups yields each matched token; unmatched
    # alternatives contribute None and inter-match gaps are '' — both are
    # falsy, so the `if token` guard filters them out.
    for token in synpat.split(text):
        if token:
            if token in openchar:
                # Start a nested block; the opener becomes its first element.
                block.append([])
                stack.append(block)
                block = block[-1]
            block.append(token)
            if token in closechar:
                # Real exceptions, not assert: asserts are stripped under
                # ``python -O`` and this is input validation.
                if block[0] != blockpair[token]:
                    raise ValueError('Block end mismatch')
                if not stack:
                    raise ValueError('Block start mismatch')
                block = stack.pop()
    if stack:
        raise ValueError('Block not closed')
    return block


if __name__ == '__main__':
    from pprint import pprint
    code = '''for x in xseq[2:]:\n prt fn(x*-5.5,"it\'s","",{'g':[0]})\nend\n'''
    pprint(tokeniser(code))