# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from src.syntax import Syntax, SyntaxLocation, SyntaxType


# Returns True if the symbol is a space or tab
def is_space(symbol):
    return symbol in (" ", "\t")


# Returns True if the symbol is a line break
def is_newline(symbol):
    return symbol == "\n"


# Returns True if the symbol is any kind of whitespace
def is_whitespace(symbol):
    return is_space(symbol) or is_newline(symbol)


# Splits text into unlocated TOKEN syntax nodes.
#
# Each whitespace symbol becomes its own token (so newlines can later
# drive line counting), while runs of non-whitespace symbols are merged
# into a single token. All tokens carry a placeholder location; real
# locations are assigned by locate_tokens.
def split_tokens(input):
    placeholder = SyntaxLocation(1, 1, "")

    def make_token(text):
        return Syntax(text, placeholder, SyntaxType.TOKEN)

    tokens = []
    buffer = ""
    for symbol in input:
        if is_whitespace(symbol):
            # Flush any pending word, then emit the whitespace alone
            if buffer:
                tokens.append(make_token(buffer))
                buffer = ""
            tokens.append(make_token(symbol))
        else:
            # Accumulate non-whitespace symbols into one token
            buffer += symbol
    # Emit whatever word was still being built at end of input
    if buffer:
        tokens.append(make_token(buffer))
    return tokens


# Rebuilds each token with its line/column location in the named file.
#
# Newline tokens advance the line counter and reset the column; every
# other token advances the column by its length.
def locate_tokens(tokens, filename):
    located = []
    line = 1
    column = 1
    for token in tokens:
        where = SyntaxLocation(line, column, filename)
        located.append(Syntax(token.value, where, SyntaxType.TOKEN))
        if is_newline(token.value):
            line += 1
            column = 1
        else:
            column += len(token.value)
    return located


# Tokenizes source code: splits it into tokens, then attaches locations
def tokenize(source, filename):
    return locate_tokens(split_tokens(source), filename)