# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

import enum


# The type of syntax
class SyntaxType(enum.Enum):
    TOKEN = enum.auto()  # pragma: no mutate
    TEXT = enum.auto()  # pragma: no mutate


# Represents a syntax node
class Syntax:
    def __init__(self, value, location, type):
        self.value = value
        self.location = location
        self.type = type

    def __repr__(self):
        return "Syntax(value %s, location %s, type %s)" % (  # pragma: no mutate
            repr(self.value),
            repr(self.location),
            str(self.type),
        )

    def __eq__(self, other):
        return (
            self.type == other.type
            and self.value == other.value
            and self.location == other.location
        )


# Location of a syntax node
class SyntaxLocation:
    def __init__(self, line, column, file):
        self.line = line
        self.column = column
        self.file = file

    def __repr__(self):
        return "SyntaxLocation(line %i, column %i, file '%s')" % (  # pragma: no mutate
            self.line,
            self.column,
            self.file,
        )

    def __eq__(self, other):
        return (
            self.line == other.line
            and self.column == other.column
            and self.file == other.file
        )


# Checks whether a symbol is a space
def is_space(symbol):
    return symbol == " " or symbol == "\t"


# Checks whether a symbol is a new line
def is_newline(symbol):
    return symbol == "\n"


# Checks whether a symbol is general whitespace
def is_whitespace(symbol):
    return is_space(symbol) or is_newline(symbol)


# Splits text into a list of words and whitespace characters
def split_tokens(input):
    if input == "":
        return []
    tokens = []
    current = input[0]
    curr_whitespace = is_whitespace(input[0])
    location = SyntaxLocation(1, 1, "")
    for c in input[1:]:
        c_whitespace = is_whitespace(c)
        if c_whitespace != curr_whitespace:
            # Flush current buffer and switch modes
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
            curr_whitespace = c_whitespace
        elif curr_whitespace:
            # Whitespace mode appends each character
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
        else:
            # Token mode builds the current buffer
            current += c
    tokens.append(Syntax(current, location, SyntaxType.TOKEN))
    return tokens


# Keywords recognized by the language
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
    "StartNote",
    "EndNote",
    "StartText",
    "EndText",
]


# Generates a list of tokens with locations
def locate_tokens(tokens, filename):
    new_tokens = []
    line = 1
    column = 1
    for t in tokens:
        location = SyntaxLocation(line, column, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        new_tokens.append(new)
        if is_newline(t.value):
            line = line + 1
            column = 1
        else:
            column += len(t.value)
    return new_tokens


# Tokenizes source code
def tokenize(source, filename):
    split = split_tokens(source)
    located = locate_tokens(split, filename)
    return located
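

# Minimal usage sketch (not part of the original module): the __main__ guard,
# the sample source string, and the filename below are illustrative only.
# Running this prints one Syntax node per word or whitespace character, each
# carrying a 1-based line/column location from locate_tokens.
if __name__ == "__main__":
    sample = "Set x To y EndSet\n"
    for token in tokenize(sample, "example.txt"):
        print(token)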