|
|
Lexing C sources into a list of tokens. Demonstration of the string processing facilities of Tcl. The same approach will work for essentially all keyword based languages, for example Pascal, module, Eiffel, etc.
<p>
List and quoting based languages (Tcl itself, Lisp, ...) can be done too, but this is not optimal.
<p>
Author: Andreas Kupries <br>
Original location: http://mini.net/tcl/3906<br>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226 | # -*- tcl -*-
# Lexing C
package provide clex 1.0
namespace eval clex {}
# Most tokens can be recognized and separated from the others via
# [string map]. The remainder are identifiers. Except for strings,
# and comments. These may contain any other token, and we may not act
# upon them. And both may contain sequences reminiscent of the
# other. Because of that a first step is to identify and parse out
# comments and strings, where 'parsing out' means that these tokens
# are replaced with special char sequences which refer the lexer to a
# small database. In the final output they are placed back.
proc clex::SCextract {code mapvar} {
# Extract strings and comments from the code and place them in mapvar.
# Replace them with ids
upvar $mapvar map
set tmp ""
set count 0
while {1} {
set comment [string first "/*" $code]
set string [string first "\"" $code]
if {($comment < 0) && ($string < 0)} {
# No comments nor strings to extract anymore.
break
}
# Debug out
#puts "$string | $comment | [string length $code]"
if {
(($string >= 0) && ($comment >= 0) && ($string < $comment)) ||
(($string >= 0) && ($comment < 0))
} {
# The next vari-sized thing is a "-quoted string.
# Finding its end is bit more difficult, because we have
# to accept \" as one character inside of the string.
set from $string
while 1 {
incr from
set stop [string first "\"" $code $from]
set stopb [string first "\\\"" $code $from]
incr stopb
if {$stop == $stopb} {set from $stopb ; incr from ; continue}
break
}
set id \000@$count\000
incr count
lappend map $id [set sub [string range $code $string $stop]]
incr stop ; incr string -1
append tmp [string range $code 0 $string] $id
set code [string range $code $stop end]
# Debug out
#puts "\tSTR $id <$sub>"
continue
}
if {
(($string >= 0) && ($comment >= 0) && ($comment < $string)) ||
(($comment >= 0) && ($string < 0))
} {
# The next vari-sized thing is a comment.
# We ignore comments.
set stop [string first "*/" $code $comment]
incr stop 2
incr comment -1
append tmp [string range $code 0 $comment]
set code [string range $code $stop end]
continue
}
return -code error "Panic, string and comment at some location"
}
append tmp $code
return $tmp
}
proc clex::DefStart {} {
variable tokens [list]
#puts "== <$tokens>"
return
}
proc clex::Key {string {replacement {}}} {
variable tokens
if {$replacement == {}} {
set replacement \000\001[string toupper $string]\000
} else {
set replacement \000\001$replacement\000
}
lappend tokens $string $replacement
#puts "== <$tokens>"
return
}
proc clex::DefEnd {} {
variable tokens
array set tmp $tokens
set res [list]
foreach key [lsort -decreasing [array names tmp]] {
lappend res $key $tmp($key)
}
set tokens $res
#puts "== <$tokens>"
return
}
proc clex::lex {code} {
variable tokens
# Phase I ... Extract strings and comments so that they don't interfere
# with the remaining phases.
# Phase II ... Separate all constant-sized tokens (keywords and
# punctuation) from each other.
# Phase III ... Separate whitespace from the useful text.
# Actually converts whitespace into separator characters.
# Phase IV ... Reinsert extracted tokens and cleanup multi-separator sequences
set scmap [list]
if 0 {
# Minimal number of commands for all phases
regsub -all -- "\[\t\n \]+" [string map $tokens \
[SCextract $code scmap]] \
\000 tmp
set code [split \
[string trim \
[string map "\000\000\000 \000 \000\000 \000" \
[string map $scmap \
$tmp]] \000] \000]
}
if 1 {
# Each phase spelled out explicitly ...
set code [SCextract $code scmap] ; # I
set code [string map $tokens $code] ; # II
regsub -all -- "\[\t\n \]+" $code \000 code ; # III
set code [string map $scmap $code] ; # IV/a
set code [string map "\000\000\000 \000 \000\000 \000" $code] ; # IV/b
set code [string trim $code \000]
set code [split $code \000]
}
# Run through the list and create something useable by the parser.
#
# A list of pairs (pairs being lists of 2 elements), where each
# pair contains the symbol to give to the parser, and associated
# data, if any.
set tmp [list]
foreach lex $code {
switch -glob -- [string index $lex 0] {
\001 {
# Keyword, no data.
lappend tmp [list [string range $lex 1 end] {}]
}
' - [0-9] {
# Character or numeric constant.
lappend tmp [list CONSTANT $lex]
}
\" {
# String literal. Strip the double-quotes.
lappend tmp [list STRING_LITERAL [string range $lex 1 end-1]]
}
default {
# Identifier. This code does not distinguish
# identifiers and type-names yet. This is defered to
# the 'scanner', i.e. the glue code feeding the lexer
# symbols into the parser.
lappend tmp [list IDENTIFIER $lex]
}
}
}
set code $tmp
return $code
}
namespace eval clex {
DefStart
Key ( LPAREN ; Key ) RPAREN ; Key -> DEREF
Key < LT ; Key <= LE ; Key == EQ
Key > GT ; Key >= GE ; Key != NE
Key \[ LBRACKET ; Key \] RBRACKET ; Key = ASSIGN
Key \{ LBRACE ; Key \} RBRACE ; Key *= MUL_ASSIGN
Key . DOT ; Key , COMMA ; Key /= DIV_ASSIGN
Key ++ INCR_OP ; Key -- DECR_OP ; Key %= REM_ASSIGN
Key & ADDR_BITAND ; Key * MULT_STAR ; Key += PLUS_ASSIGN
Key + PLUS ; Key - MINUS ; Key -= MINUS_ASSIGN
Key ~ BITNOT ; Key ! LOGNOT ; Key <<= LSHIFT_ASSIGN
Key / DIV ; Key % REM ; Key >>= RSHIFT_ASSIGN
Key << LSHIFT ; Key >> RSHIFT ; Key &= BITAND_ASSIGN
Key ^ BITEOR ; Key && LOGAND ; Key ^= BITEOR_ASSIGN
Key | BITOR ; Key || LOGOR ; Key |= BITOR_ASSIGN
Key ? QUERY ; Key : COLON ; Key \; SEMICOLON
Key ... ELLIPSIS
Key typedef ; Key extern ; Key static ; Key auto ; Key register
Key void ; Key char ; Key short ; Key int ; Key long
Key float ; Key double ; Key signed ; Key unsigned
Key goto ; Key continue ; Key break ; Key return
Key case ; Key default ; Key switch
Key struct ; Key union ; Key enum
Key while ; Key do ; Key for
Key const ; Key volatile
Key if ; Key else
Key sizeof
DefEnd
}
|
|