Lexing C sources into a list of tokens — a demonstration of the string processing facilities of Tcl. The same approach will work for essentially all keyword-based languages, for example Pascal, Modula, Eiffel, etc. List- and quoting-based languages (Tcl itself, Lisp, ...) can be handled too, but this approach is not optimal for them. Author: Andreas Kupries. Original location: http://mini.net/tcl/3906
# -*- tcl -*-
# Lexing C
package provide clex 1.0
namespace eval clex {}
# Most tokens can be recognized and separated from the others via
# [string map]. The remainder are identifiers. Except for strings,
# and comments. These may contain any other token, and we may not act
# upon them. And both may contain sequences reminiscent of the
# other. Because of that a first step is to identify and parse out
# comments and strings, where 'parsing out' means that these tokens
# are replaced with special char sequences which refer the lexer to a
# small database. In the final output they are placed back.
# SCextract -- Phase I of the lexer.
#
# Scans <code> for the two variable-sized token classes of C --
# "..." string literals and /* ... */ comments -- and removes them
# before the fixed-size tokens are split apart. Each string literal
# is replaced by a unique id of the form \000@<n>\000 and the pair
# (id, literal) is appended to the list in <mapvar>, so phase IV of
# [clex::lex] can re-insert the literal via [string map]. Comments
# are simply dropped from the output.
#
# code   - C source text to process.
# mapvar - name of a variable in the caller's scope; receives the
#          id -> literal pairs as a flat list usable by [string map].
#
# Result: the code with string literals replaced by ids and
# comments removed.
proc clex::SCextract {code mapvar} {
    upvar $mapvar map
    set tmp ""     ;# already-processed prefix accumulates here
    set count 0    ;# running counter used to generate unique ids
    while {1} {
	# Locate the next candidate start of each class; whichever
	# comes first in the text is the one to extract.
	set comment [string first "/*" $code]
	set string  [string first "\"" $code]
	if {($comment < 0) && ($string < 0)} {
	    # No comments nor strings to extract anymore.
	    break
	}
	# Debug out
	#puts "$string | $comment | [string length $code]"
	if {
	    (($string >= 0) && ($comment >= 0) && ($string < $comment)) ||
	    (($string >= 0) && ($comment < 0))
	} {
	    # The next vari-sized thing is a "-quoted string.
	    # Finding its end is a bit more difficult, because we have
	    # to accept \" as one character inside of the string.
	    set from $string
	    while 1 {
		incr from
		# stop  = next plain double-quote at/after <from>.
		# stopb = index of the quote inside the next \" pair:
		#         [string first] returns the backslash's index,
		#         the incr below moves it onto the quote.
		set stop  [string first "\"" $code $from]
		set stopb [string first "\\\"" $code $from]
		incr stopb
		# If both indices coincide, the quote we found is the
		# escaped one; skip past it and search again.
		# NOTE(review): a literal ending in \\" (escaped
		# backslash, then a real closing quote) would be
		# mis-detected as an escaped quote -- confirm whether
		# that input can occur.
		if {$stop == $stopb} {set from $stopb ; incr from ; continue}
		break
	    }
	    # Replace literal with id, remember the (id, literal) pair.
	    set id \000@$count\000
	    incr count
	    lappend map $id [set sub [string range $code $string $stop]]
	    # Shift past the literal: prefix + id goes to tmp, the
	    # remainder becomes the new working text.
	    incr stop ; incr string -1
	    append tmp [string range $code 0 $string] $id
	    set code [string range $code $stop end]
	    # Debug out
	    #puts "\tSTR $id <$sub>"
	    continue
	}
	if {
	    (($string >= 0) && ($comment >= 0) && ($comment < $string)) ||
	    (($comment >= 0) && ($string < 0))
	} {
	    # The next vari-sized thing is a comment.
	    # We ignore comments: copy the prefix, drop the comment.
	    set stop [string first "*/" $code $comment]
	    incr stop 2
	    incr comment -1
	    append tmp [string range $code 0 $comment]
	    set code [string range $code $stop end]
	    continue
	}
	# Unreachable: the two guards above cover all cases where at
	# least one of <string>/<comment> is non-negative.
	return -code error "Panic, string and comment at some location"
    }
    # No more strings or comments; keep the rest of the text as-is.
    append tmp $code
    return $tmp
}
# DefStart -- Open a new token definition block.
#
# Resets the namespace variable `tokens`, which subsequent [Key]
# calls fill and [DefEnd] finalizes.
proc clex::DefStart {} {
    variable tokens
    set tokens {}
    return
}
# Key -- Register one fixed-size token for phase II of the lexer.
#
# The token <string> is mapped to \000\001<replacement>\000; the
# \000 bytes separate it from neighbouring text, and the \001 byte
# marks the result as a keyword symbol for [clex::lex].
#
# string      - literal token to recognize (keyword or punctuation).
# replacement - optional symbol name; defaults to the upper-cased
#               token itself.
proc clex::Key {string {replacement {}}} {
    variable tokens
    # Fix: use `eq` instead of `==`. `==` attempts a numeric
    # comparison first; `eq` (Tcl 8.4+) compares as strings
    # unconditionally, which is what is meant here.
    if {$replacement eq {}} {
	set replacement \000\001[string toupper $string]\000
    } else {
	set replacement \000\001$replacement\000
    }
    lappend tokens $string $replacement
    #puts "== <$tokens>"
    return
}
# DefEnd -- Close the token definition block.
#
# Deduplicates the collected (token, replacement) pairs and sorts
# them in decreasing lexicographic order, so that [string map] tries
# longer tokens before their prefixes (e.g. "<=" before "<").
proc clex::DefEnd {} {
    variable tokens
    array set uniq $tokens
    set sorted [list]
    foreach tok [lsort -decreasing [array names uniq]] {
	lappend sorted $tok $uniq($tok)
    }
    set tokens $sorted
    return
}
# lex -- Convert C source <code> into a list of symbol/value pairs.
#
# The lexer runs in four phases:
#   I    [SCextract] pulls string literals and comments out of the
#        text so they cannot interfere with token splitting.
#   II   [string map $tokens] wraps every fixed-size token (keyword,
#        punctuation) in \000 separators with a \001 keyword marker.
#   III  whitespace runs are collapsed into the separator \000.
#   IV   the literals from phase I are re-inserted, and runs of
#        multiple separators are collapsed into single ones.
#
# code - C source text.
#
# Result: a list of pairs (lists of 2 elements), where each pair
# holds the symbol for the parser -- a name registered via [Key],
# or CONSTANT, STRING_LITERAL, IDENTIFIER -- and its data, if any.
proc clex::lex {code} {
    variable tokens
    # Phase I ... Extract strings and comments so that they don't interfere
    # with the remaining phases.
    # Phase II ... Separate all constant-sized tokens (keywords and
    # punctuation) from each other.
    # Phase III ... Separate whitespace from the useful text.
    # Actually converts whitespace into separator characters.
    # Phase IV ... Reinsert extracted tokens and cleanup multi-separator sequences
    set scmap [list]
    if 0 {
	# Disabled alternative: minimal number of commands for all
	# phases, equivalent to the spelled-out version below.
	regsub -all -- "\[\t\n \]+" [string map $tokens \
		[SCextract $code scmap]] \
		\000 tmp
	set code [split \
		[string trim \
		[string map "\000\000\000 \000 \000\000 \000" \
		[string map $scmap \
		$tmp]] \000] \000]
    }
    if 1 {
	# Each phase spelled out explicitly ...
	set code [SCextract $code scmap] ; # I
	set code [string map $tokens $code] ; # II
	regsub -all -- "\[\t\n \]+" $code \000 code ; # III
	set code [string map $scmap $code] ; # IV/a
	# IV/b: the map collapses runs of three or two \000
	# separators into a single one.
	set code [string map "\000\000\000 \000 \000\000 \000" $code] ; # IV/b
	set code [string trim $code \000]
	set code [split $code \000]
    }
    # Run through the list and create something useable by the parser.
    #
    # A list of pairs (pairs being lists of 2 elements), where each
    # pair contains the symbol to give to the parser, and associated
    # data, if any.
    #
    # Dispatch on the first character of each token; -glob is needed
    # for the [0-9] pattern.
    set tmp [list]
    foreach lex $code {
	switch -glob -- [string index $lex 0] {
	    \001 {
		# Keyword (marker byte inserted by [Key]), no data.
		lappend tmp [list [string range $lex 1 end] {}]
	    }
	    ' - [0-9] {
		# Character or numeric constant.
		lappend tmp [list CONSTANT $lex]
	    }
	    \" {
		# String literal. Strip the double-quotes.
		lappend tmp [list STRING_LITERAL [string range $lex 1 end-1]]
	    }
	    default {
		# Identifier. This code does not distinguish
		# identifiers and type-names yet. This is deferred to
		# the 'scanner', i.e. the glue code feeding the lexer
		# symbols into the parser.
		lappend tmp [list IDENTIFIER $lex]
	    }
	}
    }
    set code $tmp
    return $code
}
namespace eval clex {
    # Token map for ANSI C. The relative order of the definitions
    # does not matter: [DefEnd] sorts the final map so that
    # [string map] tries longer tokens before their prefixes
    # (e.g. "<=" before "<").
    DefStart

    # Keywords; their symbol defaults to the keyword in upper case.
    foreach kw {
	typedef extern static auto register
	void char short int long
	float double signed unsigned
	goto continue break return
	case default switch
	struct union enum
	while do for
	const volatile
	if else
	sizeof
    } {
	Key $kw
    }

    # Punctuation and operators, with explicit symbol names.
    foreach {tok sym} {
	(   LPAREN        )   RPAREN        ->  DEREF
	<   LT            <=  LE            ==  EQ
	>   GT            >=  GE            !=  NE
	[   LBRACKET      ]   RBRACKET      =   ASSIGN
	\{  LBRACE        \}  RBRACE        *=  MUL_ASSIGN
	.   DOT           ,   COMMA         /=  DIV_ASSIGN
	++  INCR_OP       --  DECR_OP       %=  REM_ASSIGN
	&   ADDR_BITAND   *   MULT_STAR     +=  PLUS_ASSIGN
	+   PLUS          -   MINUS         -=  MINUS_ASSIGN
	~   BITNOT        !   LOGNOT        <<= LSHIFT_ASSIGN
	/   DIV           %   REM           >>= RSHIFT_ASSIGN
	<<  LSHIFT        >>  RSHIFT        &=  BITAND_ASSIGN
	^   BITEOR        &&  LOGAND        ^=  BITEOR_ASSIGN
	|   BITOR         ||  LOGOR         |=  BITOR_ASSIGN
	?   QUERY         :   COLON         ;   SEMICOLON
	... ELLIPSIS
    } {
	Key $tok $sym
    }

    DefEnd
}
# Tags: debugging