import sys import os EOF = "" def get_num(): digits = [] t = getchar() while t.isdigit(): digits.append(t) t = getchar() if t != EOF: ungetc(t) num = int("".join(digits)) return num def get_word(): chars = [] t = getchar() while t.isalnum(): chars.append(t) t = getchar() if t != EOF: ungetc(t) s = "".join(chars) return s class DragonException(Exception): pass class Tag(object): NONE = -1 NUM = 256 DIV = 257 MOD = 258 ID = 259 DONE = 260 class Token(object): def __init__(self, t, value=''): self.tag = t self.value = value def __str__(self): return "Token([%s] [%s])" % (str(self.tag), str(self.value)) class Num(Token): def __init__(self, v): Token.__init__(self, Tag.NUM, v) class Word(Token): def __init__(self, tag, lexeme): Token.__init__(self, tag, lexeme) class Lexer(object): def __init__(self): self.lineno = 0 self.lookahead = None self.out = [] self.sym_table = {} # symbol table: lexeme -> token tag self.insert("div", Tag.DIV) self.insert("mod", Tag.MOD) def lookup(self, key): return self.sym_table.get(key, None) def insert(self, key, tag): self.sym_table[key] = tag def parse(self): self.lookahead = self.lexan() while self.lookahead.tag != Tag.DONE: self.expr() self.match(';') return self def lexan(self): token = None while not token: t = getchar() if t == ' ' or t == '\t': pass # strip out white space elif t == '\n': self.lineno += 1 elif t.isdigit(): ungetc(t) num = get_num() token = Num(num) elif t.isalpha(): ungetc(t) word = get_word() tag = self.lookup(word) if not tag: tag = Tag.ID self.insert(word, tag) token = Word(tag, word) elif t == EOF: token = Token(Tag.DONE) else: token = Token(t) return token def expr(self): self.term() while True: token = self.lookahead if token.tag == '+' \ or token.tag == '-': self.match(token.tag) self.term() self.emit(token) else: break def term(self): self.factor() while True: token = self.lookahead if token.tag == '*' \ or token.tag == '/' \ or token.tag == Tag.DIV \ or token.tag == Tag.MOD: self.match(token.tag) self.factor() self.emit(token) else: break def factor(self): token = self.lookahead if token.tag == '(': self.match('(') self.expr() self.match(')') elif token.tag == Tag.NUM: self.emit(token) self.match(Tag.NUM) elif token.tag == Tag.ID: self.emit(token) self.match(Tag.ID) else: raise DragonException("factor: bad token tag - %s" % token) def match(self, tag): if self.lookahead.tag == tag: self.lookahead = self.lexan() else: raise DragonException("match: tag %s doesn't match %s" % (tag, self.lookahead)) def emit(self, token): tag = token.tag value = token.value if tag == '+' or tag == '-' or tag == '*' or tag == '/': out = "%s" % tag elif tag == Tag.DIV: out = "DIV" elif tag == Tag.MOD: out = "MOD" elif tag == Tag.NUM: out = "%d" % value elif tag == Tag.ID: out = "%s" % value else: out = "emit: %s" % (token) self.out.append(out) print out def db(self, intro=""): print "%s %s" % (intro, self.lookahead) def main(): lex = Lexer().parse() print lex.out # python specific import StringIO FP = None def getchar(): if FP: return FP.read(1) def ungetc(c=''): if FP: FP.seek(-1, os.SEEK_CUR) SRC = """1 + (2 + 3); 4 * (5 + 6); 7 div (8 - 9); 10 mod (11 + 12); """ if __name__ == "__main__": FP = StringIO.StringIO(SRC) main()