Lexing C sources into a list of tokens — a demonstration of the string processing facilities of Tcl. The same approach will work for essentially all keyword-based languages, for example Pascal, Modula, Eiffel, etc. List- and quoting-based languages (Tcl itself, Lisp, ...) can be handled too, but this approach is not optimal for them. Author: Andreas Kupries. Original location: http://mini.net/tcl/3906
# -*- tcl -*-
# Lexing C
package provide clex 1.0
namespace eval clex {}
# Most tokens can be recognized and separated from the others via
# [string map]. The remainder are identifiers. Except for strings,
# and comments. These may contain any other token, and we may not act
# upon them. And both may contain sequences reminiscent of the
# other. Because of that a first step is to identify and parse out
# comments and strings, where 'parsing out' means that these tokens
# are replaced with special char sequences which refer the lexer to a
# small database. In the final output they are placed back.
# SCextract -- Phase I of the lexer.
#
# Scans <code> for the two variable-sized token classes of C --
# "..." string literals and /* ... */ comments -- and removes them
# before the fixed-size tokens are split apart. Each string literal
# is replaced by a unique id of the form \000@<n>\000 and the pair
# (id, literal) is appended to the list in <mapvar>, so phase IV of
# [clex::lex] can re-insert the literal via [string map]. Comments
# are simply dropped from the output.
#
# code   - C source text to process.
# mapvar - name of a variable in the caller's scope; receives the
#          id -> literal pairs as a flat list usable by [string map].
#
# Result: the code with string literals replaced by ids and
# comments removed.
proc clex::SCextract {code mapvar} {
    upvar $mapvar map
    set tmp ""     ;# already-processed prefix accumulates here
    set count 0    ;# running counter used to generate unique ids
    while {1} {
	# Locate the next candidate start of each class; whichever
	# comes first in the text is the one to extract.
	set comment [string first "/*" $code]
	set string  [string first "\"" $code]
	if {($comment < 0) && ($string < 0)} {
	    # No comments nor strings to extract anymore.
	    break
	}
	# Debug out
	#puts "$string | $comment | [string length $code]"
	if {
	    (($string >= 0) && ($comment >= 0) && ($string < $comment)) ||
	    (($string >= 0) && ($comment < 0))
	} {
	    # The next vari-sized thing is a "-quoted string.
	    # Finding its end is a bit more difficult, because we have
	    # to accept \" as one character inside of the string.
	    set from $string
	    while 1 {
		incr from
		# stop  = next plain double-quote at/after <from>.
		# stopb = index of the quote inside the next \" pair:
		#         [string first] returns the backslash's index,
		#         the incr below moves it onto the quote.
		set stop  [string first "\"" $code $from]
		set stopb [string first "\\\"" $code $from]
		incr stopb
		# If both indices coincide, the quote we found is the
		# escaped one; skip past it and search again.
		# NOTE(review): a literal ending in \\" (escaped
		# backslash, then a real closing quote) would be
		# mis-detected as an escaped quote -- confirm whether
		# that input can occur.
		if {$stop == $stopb} {set from $stopb ; incr from ; continue}
		break
	    }
	    # Replace literal with id, remember the (id, literal) pair.
	    set id \000@$count\000
	    incr count
	    lappend map $id [set sub [string range $code $string $stop]]
	    # Shift past the literal: prefix + id goes to tmp, the
	    # remainder becomes the new working text.
	    incr stop ; incr string -1
	    append tmp [string range $code 0 $string] $id
	    set code [string range $code $stop end]
	    # Debug out
	    #puts "\tSTR $id <$sub>"
	    continue
	}
	if {
	    (($string >= 0) && ($comment >= 0) && ($comment < $string)) ||
	    (($comment >= 0) && ($string < 0))
	} {
	    # The next vari-sized thing is a comment.
	    # We ignore comments: copy the prefix, drop the comment.
	    set stop [string first "*/" $code $comment]
	    incr stop 2
	    incr comment -1
	    append tmp [string range $code 0 $comment]
	    set code [string range $code $stop end]
	    continue
	}
	# Unreachable: the two guards above cover all cases where at
	# least one of <string>/<comment> is non-negative.
	return -code error "Panic, string and comment at some location"
    }
    # No more strings or comments; keep the rest of the text as-is.
    append tmp $code
    return $tmp
}
# DefStart -- Open a new token definition block.
#
# Resets the namespace variable `tokens`, which subsequent [Key]
# calls fill and [DefEnd] finalizes.
proc clex::DefStart {} {
    variable tokens
    set tokens {}
    return
}
# Key -- Register one fixed-size token for phase II of the lexer.
#
# The token <string> is mapped to \000\001<replacement>\000; the
# \000 bytes separate it from neighbouring text, and the \001 byte
# marks the result as a keyword symbol for [clex::lex].
#
# string      - literal token to recognize (keyword or punctuation).
# replacement - optional symbol name; defaults to the upper-cased
#               token itself.
proc clex::Key {string {replacement {}}} {
    variable tokens
    # Fix: use `eq` instead of `==`. `==` attempts a numeric
    # comparison first; `eq` (Tcl 8.4+) compares as strings
    # unconditionally, which is what is meant here.
    if {$replacement eq {}} {
	set replacement \000\001[string toupper $string]\000
    } else {
	set replacement \000\001$replacement\000
    }
    lappend tokens $string $replacement
    #puts "== <$tokens>"
    return
}
# DefEnd -- Close the token definition block.
#
# Deduplicates the collected (token, replacement) pairs and sorts
# them in decreasing lexicographic order, so that [string map] tries
# longer tokens before their prefixes (e.g. "<=" before "<").
proc clex::DefEnd {} {
    variable tokens
    array set uniq $tokens
    set sorted [list]
    foreach tok [lsort -decreasing [array names uniq]] {
	lappend sorted $tok $uniq($tok)
    }
    set tokens $sorted
    return
}
# lex -- Convert C source <code> into a list of symbol/value pairs.
#
# The lexer runs in four phases:
#   I    [SCextract] pulls string literals and comments out of the
#        text so they cannot interfere with token splitting.
#   II   [string map $tokens] wraps every fixed-size token (keyword,
#        punctuation) in \000 separators with a \001 keyword marker.
#   III  whitespace runs are collapsed into the separator \000.
#   IV   the literals from phase I are re-inserted, and runs of
#        multiple separators are collapsed into single ones.
#
# code - C source text.
#
# Result: a list of pairs (lists of 2 elements), where each pair
# holds the symbol for the parser -- a name registered via [Key],
# or CONSTANT, STRING_LITERAL, IDENTIFIER -- and its data, if any.
proc clex::lex {code} {
    variable tokens
    # Phase I ... Extract strings and comments so that they don't interfere
    # with the remaining phases.
    # Phase II ... Separate all constant-sized tokens (keywords and
    # punctuation) from each other.
    # Phase III ... Separate whitespace from the useful text.
    # Actually converts whitespace into separator characters.
    # Phase IV ... Reinsert extracted tokens and cleanup multi-separator sequences
    set scmap [list]
    if 0 {
	# Disabled alternative: minimal number of commands for all
	# phases, equivalent to the spelled-out version below.
	regsub -all -- "\[\t\n \]+" [string map $tokens \
		[SCextract $code scmap]] \
		\000 tmp
	set code [split \
		[string trim \
		[string map "\000\000\000 \000 \000\000 \000" \
		[string map $scmap \
		$tmp]] \000] \000]
    }
    if 1 {
	# Each phase spelled out explicitly ...
	set code [SCextract $code scmap] ; # I
	set code [string map $tokens $code] ; # II
	regsub -all -- "\[\t\n \]+" $code \000 code ; # III
	set code [string map $scmap $code] ; # IV/a
	# IV/b: the map collapses runs of three or two \000
	# separators into a single one.
	set code [string map "\000\000\000 \000 \000\000 \000" $code] ; # IV/b
	set code [string trim $code \000]
	set code [split $code \000]
    }
    # Run through the list and create something useable by the parser.
    #
    # A list of pairs (pairs being lists of 2 elements), where each
    # pair contains the symbol to give to the parser, and associated
    # data, if any.
    #
    # Dispatch on the first character of each token; -glob is needed
    # for the [0-9] pattern.
    set tmp [list]
    foreach lex $code {
	switch -glob -- [string index $lex 0] {
	    \001 {
		# Keyword (marker byte inserted by [Key]), no data.
		lappend tmp [list [string range $lex 1 end] {}]
	    }
	    ' - [0-9] {
		# Character or numeric constant.
		lappend tmp [list CONSTANT $lex]
	    }
	    \" {
		# String literal. Strip the double-quotes.
		lappend tmp [list STRING_LITERAL [string range $lex 1 end-1]]
	    }
	    default {
		# Identifier. This code does not distinguish
		# identifiers and type-names yet. This is deferred to
		# the 'scanner', i.e. the glue code feeding the lexer
		# symbols into the parser.
		lappend tmp [list IDENTIFIER $lex]
	    }
	}
    }
    set code $tmp
    return $code
}
namespace eval clex {
    # Token map for ANSI C. The relative order of the definitions
    # does not matter: [DefEnd] sorts the final map so that
    # [string map] tries longer tokens before their prefixes
    # (e.g. "<=" before "<").
    DefStart

    # Keywords; their symbol defaults to the keyword in upper case.
    foreach kw {
	typedef extern static auto register
	void char short int long
	float double signed unsigned
	goto continue break return
	case default switch
	struct union enum
	while do for
	const volatile
	if else
	sizeof
    } {
	Key $kw
    }

    # Punctuation and operators, with explicit symbol names.
    foreach {tok sym} {
	(   LPAREN        )   RPAREN        ->  DEREF
	<   LT            <=  LE            ==  EQ
	>   GT            >=  GE            !=  NE
	[   LBRACKET      ]   RBRACKET      =   ASSIGN
	\{  LBRACE        \}  RBRACE        *=  MUL_ASSIGN
	.   DOT           ,   COMMA         /=  DIV_ASSIGN
	++  INCR_OP       --  DECR_OP       %=  REM_ASSIGN
	&   ADDR_BITAND   *   MULT_STAR     +=  PLUS_ASSIGN
	+   PLUS          -   MINUS         -=  MINUS_ASSIGN
	~   BITNOT        !   LOGNOT        <<= LSHIFT_ASSIGN
	/   DIV           %   REM           >>= RSHIFT_ASSIGN
	<<  LSHIFT        >>  RSHIFT        &=  BITAND_ASSIGN
	^   BITEOR        &&  LOGAND        ^=  BITEOR_ASSIGN
	|   BITOR         ||  LOGOR         |=  BITOR_ASSIGN
	?   QUERY         :   COLON         ;   SEMICOLON
	... ELLIPSIS
    } {
	Key $tok $sym
    }

    DefEnd
}
# Tags: debugging