# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>
import enum


# Checks whether a symbol is a space or tab (intentionally NOT all of
# str.isspace(): newlines are classified separately)
def is_space(symbol):
    return symbol == " " or symbol == "\t"


# Checks whether a symbol is a new line
def is_newline(symbol):
    return symbol == "\n"


# Checks whether a symbol is general whitespace (space, tab or newline)
def is_whitespace(symbol):
    return is_space(symbol) or is_newline(symbol)


# Location of a token: 1-based line and column plus the source file name
class TokenLocation:
    def __init__(self, line, column, file):
        self.line = line
        self.column = column
        self.file = file

    def __repr__(self):
        return "TokenLocation(line %i, column %i, file '%s')" % (  # pragma: no mutate
            self.line,
            self.column,
            self.file,
        )

    def __eq__(self, other):
        return (
            self.line == other.line
            and self.column == other.column
            and self.file == other.file
        )


# The type of a token
class TokenType(enum.Enum):
    UNKNOWN = enum.auto()  # pragma: no mutate
    SPACE = enum.auto()  # pragma: no mutate
    NEWLINE = enum.auto()  # pragma: no mutate
    BOOL = enum.auto()  # pragma: no mutate
    KEYWORD = enum.auto()  # pragma: no mutate
    SHEBANG = enum.auto()  # pragma: no mutate


# Represents a tokenizer token: its text, its location and its TokenType
class Token:
    def __init__(self, value, location, type):
        self.value = value
        self.location = location
        self.type = type

    def __repr__(self):
        return "Token(value %s, location %s, type %s)" % (  # pragma: no mutate
            repr(self.value),
            repr(self.location),
            str(self.type),
        )

    def __eq__(self, other):
        return (
            self.value == other.value
            and self.location == other.location
            and self.type == other.type
        )


# Splits text in to a list of characters and whitespace
#
# Non-whitespace characters accumulate into a single token; each
# whitespace character becomes its own one-character token. Tokens are
# emitted with a placeholder location (1, 1, "") and TokenType.UNKNOWN;
# classify_tokens and locate_tokens fill those in later.
# Returns [] for empty input.
def split_tokens(input):
    if input == "":
        return []
    tokens = []
    current = input[0]
    curr_whitespace = is_whitespace(input[0])
    location = TokenLocation(1, 1, "")
    kind = TokenType.UNKNOWN
    for c in input[1:]:
        c_whitespace = is_whitespace(c)
        if c_whitespace != curr_whitespace:
            # Flush current buffer and switch modes
            tokens.append(Token(current, location, kind))
            current = c
            curr_whitespace = c_whitespace
        elif curr_whitespace:
            # Whitespace mode appends each character
            tokens.append(Token(current, location, kind))
            current = c
        else:
            # Token mode builds the current buffer
            current += c
    # Flush whatever is left in the buffer at end of input
    tokens.append(Token(current, location, kind))
    return tokens


# Keywords recognized by the language
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
    "StartNote",
    "EndNote",
    "StartText",
    "EndText",
]


# Classifies tokens in to types
#
# Returns a new token list; inputs are not mutated. Precedence is:
# newline, space, bool literal, keyword, shebang prefix, then UNKNOWN.
def classify_tokens(tokens):
    new_tokens = []
    for t in tokens:
        if is_newline(t.value):
            kind = TokenType.NEWLINE
        elif is_space(t.value):
            kind = TokenType.SPACE
        elif t.value in ["True", "False"]:
            kind = TokenType.BOOL
        elif t.value in keywords:
            kind = TokenType.KEYWORD
        elif t.value[0:2] == "#!":
            kind = TokenType.SHEBANG
        else:
            kind = TokenType.UNKNOWN
        new = Token(t.value, t.location, kind)
        new_tokens.append(new)
    return new_tokens


# Generates a list of tokens with locations
#
# Walks the token stream tracking 1-based line/column positions:
# a NEWLINE token advances the line and resets the column, any other
# token advances the column by its length. Returns new tokens tagged
# with TokenLocation(line, column, filename); inputs are not mutated.
def locate_tokens(tokens, filename):
    new_tokens = []
    line = 1
    column = 1
    for t in tokens:
        location = TokenLocation(line, column, filename)
        new = Token(t.value, location, t.type)
        new_tokens.append(new)
        if t.type == TokenType.NEWLINE:
            line = line + 1
            column = 1
        else:
            column += len(t.value)
    return new_tokens


# Tokenizes source code
#
# Pipeline: split the source into raw tokens, classify each token's
# type, then attach line/column locations using filename.
def tokenize(source, filename):
    split = split_tokens(source)
    classified = classify_tokens(split)
    located = locate_tokens(classified, filename)
    return located