import re


def tokeniser(text, tokenpat=None, blockchar='()[]{}'):
    """Lightweight text tokeniser for simple structured text.

    Splits *text* into tokens — numbers, words, double/single-quoted
    strings, runs of spaces/tabs, newlines, and single characters — and
    nests the tokens found between paired bracket characters into
    sub-lists, so the result mirrors the text's block structure.

    Parameters:
        text: the string to tokenise.
        tokenpat: optional replacement regex; every alternative must be a
            capturing group, because ``re.split`` only returns group
            matches as tokens.
        blockchar: bracket pairs as an interleaved open/close string,
            e.g. ``'()[]{}'`` — even positions open, odd positions close.

    Returns:
        A (possibly nested) list of token strings; each bracketed region
        becomes a sub-list whose first and last elements are the brackets.

    Raises:
        ValueError: if brackets in *text* are unbalanced or mismatched.
    """
    defpat = r'''(-?\d+\.?\d*)|(\w+)|(".*?")|('.*?')|([ \t]+)|(\n)|(.)'''
    # Even indices open a block, odd indices close one; map closer -> opener.
    openchar, closechar = blockchar[0::2], blockchar[1::2]
    blockpair = dict(zip(closechar, openchar))
    stack = []   # enclosing blocks, innermost last
    block = []   # block currently being filled
    synpat = re.compile(tokenpat or defpat, re.M)
    # re.split with capturing groups yields each matched token; unmatched
    # alternatives contribute None and inter-match gaps are '' — both are
    # falsy, so the `if token` guard filters them out.
    for token in synpat.split(text):
        if token:
            if token in openchar:
                # Start a nested block; the opener becomes its first element.
                block.append([])
                stack.append(block)
                block = block[-1]
            block.append(token)
            if token in closechar:
                # Real exceptions, not assert: asserts are stripped under
                # ``python -O`` and this is input validation.
                if block[0] != blockpair[token]:
                    raise ValueError('Block end mismatch')
                if not stack:
                    raise ValueError('Block start mismatch')
                block = stack.pop()
    if stack:
        raise ValueError('Block not closed')
    return block


if __name__ == '__main__':
    from pprint import pprint
    code = '''for x in xseq[2:]:\n prt fn(x*-5.5,"it\'s","",{'g':[0]})\nend\n'''
    pprint(tokeniser(code))