diff --git a/parse.py b/parse.py
index 0e84ec2..1f80f54 100644
--- a/parse.py
+++ b/parse.py
@@ -11,6 +11,9 @@
     def __repr__(self):
         return "Token(type %s, value '%s')" % (self.type, self.value)
 
+def is_whitespace(symbol):
+    return symbol == " " or symbol == "\t" or symbol == "\n"
+
 class Tokenizer:
     def __init__(self, input):
         self.code = input
@@ -29,52 +32,59 @@
         self.pos += 1
         return symbol
 
+    def read_token(self):
+        token = ""
+        symbol = self.next()
+        if not symbol:
+            log.log(log.LEXER, log.TRACE, "No token to read")
+            return None
+        while symbol and not is_whitespace(symbol):
+            token += symbol
+            symbol = self.next()
+        log.log(log.LEXER, log.TRACE, "Read token '%s'" % (token))
+        return token
+
     def tokenize(self):
         tokens = []
-        token = ""
         text = ""
         mode = "normal" # normal/note/text
-        symbol = self.next()
-        while symbol:
-            if symbol == " " or symbol == "\t" or symbol == "\n":
-                if token == "":
-                    pass
-                elif token == "BeginNote":
-                    log.log(log.LEXER, log.TRACE, "Switching to note mode")
-                    mode = "note"
-                elif token == "EndNote":
-                    log.log(log.LEXER, log.TRACE, "Ending note mode")
-                    mode = "normal"
-                elif token == "BeginText":
-                    log.log(log.LEXER, log.TRACE, "Switching to text mode")
-                    mode = "text"
-                elif token == "EndText":
-                    log.log(log.LEXER, log.TRACE, "Ending text mode")
-                    content = text[1:-8]
-                    log.log(log.LEXER, log.DEBUG, "Appending text '%s'" % (content))
-                    tokens.append(Token("text", content))
-                    mode = "normal"
-                    text = ""
-                elif token != "":
-                    if mode == "normal":
-                        keywords = ["NewLang", "Done", "Set", "To", "EndSet",
-                                    "If", "Then", "Else", "EndIf"]
-                        if token in keywords:
-                            type = "keyword"
-                            token = token.lower()
-                        else:
-                            type = "symbol"
-                        tok = Token(type, token)
-                        log.log(log.LEXER, log.DEBUG, "Appending %s" % (tok))
-                        tokens.append(tok)
+        token = self.read_token()
+        while token != None:
+            if token == "":
+                pass
+            elif token == "BeginNote":
+                log.log(log.LEXER, log.TRACE, "Switching to note mode")
+                mode = "note"
+            elif token == "EndNote":
+                log.log(log.LEXER, log.TRACE, "Ending note mode")
+                mode = "normal"
+            elif token == "BeginText":
+                log.log(log.LEXER, log.TRACE, "Switching to text mode")
+                mode = "text"
+            elif token == "EndText":
+                log.log(log.LEXER, log.TRACE, "Ending text mode")
+                content = text[10:-1]
+                log.log(log.LEXER, log.DEBUG, "Appending text '%s'" % (content))
+                tokens.append(Token("text", content))
+                mode = "normal"
+                text = ""
+            elif token != "":
+                if mode == "normal":
+                    keywords = ["NewLang", "Done", "Set", "To", "EndSet",
+                                "If", "Then", "Else", "EndIf"]
+                    if token in keywords:
+                        type = "keyword"
+                        token = token.lower()
                     else:
-                        log.log(log.LEXER, log.TRACE, "Skipping token '%s'" % (token))
-                token = ""
-            else:
-                token += symbol
+                        type = "symbol"
+                    tok = Token(type, token)
+                    log.log(log.LEXER, log.DEBUG, "Appending %s" % (tok))
+                    tokens.append(tok)
+                else:
+                    log.log(log.LEXER, log.TRACE, "Skipping token '%s'" % (token))
             if mode == "text":
-                text += symbol
-            symbol = self.next()
+                text += token + " "
+            token = self.read_token()
         log.log(log.LEXER, log.TRACE, "Done lexing, adding EOF")
         tokens.append(Token("EOF", None))
         return tokens
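
A quick standalone check of the new text-mode bookkeeping (a sketch with made-up input, not part of the commit): tokenize() now appends whole tokens plus a trailing space to text, including the "BeginText" token itself, so the slice changes from text[1:-8] (strip the leading space and the trailing " EndText" left behind by the old symbol-by-symbol loop) to text[10:-1] (strip the 10-character prefix "BeginText " and the final space; "EndText" itself never reaches text, because its branch resets mode before the append).

    # Hypothetical trace of the new slice; the input token list is invented.
    text = ""
    for token in ["BeginText", "Hello,", "world!", "EndText"]:
        if token == "EndText":
            content = text[10:-1]  # len("BeginText ") == 10; -1 drops the trailing space
            break
        text += token + " "        # mirrors: if mode == "text": text += token + " "
    print(repr(content))           # prints 'Hello, world!'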