# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from src.syntax import Syntax, SyntaxLocation, SyntaxType

# Valid space code points
spaces = [
    "\t",  # U+0009 HORIZONTAL TAB
    " ",  # U+0020 SPACE
]

# Valid newline code points
newlines = [
    "\n",  # U+000A LINE FEED
    "\v",  # U+000B VERTICAL TAB
    "\f",  # U+000C FORM FEED
    "\r",  # U+000D CARRIAGE RETURN
    "\u0085",  # U+0085 NEXT LINE
    "\u2028",  # U+2028 LINE SEPARATOR
    "\u2029",  # U+2029 PARAGRAPH SEPARATOR
]


# Checks whether a symbol is general whitespace
def is_whitespace(symbol):
    return symbol in spaces or symbol in newlines


# Splits text into a list of tokens and whitespace
def split_tokens(input):
    if input == "":
        return []
    tokens = []
    current = input[0]
    curr_whitespace = is_whitespace(input[0])
    # Placeholder location; real locations are filled in by locate_tokens
    location = SyntaxLocation(1, 1, "")
    for c in input[1:]:
        c_whitespace = is_whitespace(c)
        if c_whitespace != curr_whitespace:
            # Flush the current buffer and switch modes
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
            curr_whitespace = c_whitespace
        elif curr_whitespace:
            # Whitespace mode emits each code point as its own token
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
        else:
            # Token mode accumulates code points into the current buffer
            current += c
    # Flush whatever is left in the buffer
    tokens.append(Syntax(current, location, SyntaxType.TOKEN))
    return tokens


# Generates a list of tokens with locations
def locate_tokens(tokens, filename):
    new_tokens = []
    line = 1
    offset = 1
    for t in tokens:
        location = SyntaxLocation(line, offset, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        new_tokens.append(new)
        if t.value in newlines:
            # Each newline code point starts a new line at offset 1
            line += 1
            offset = 1
        else:
            offset += len(t.value)
    return new_tokens


# Removes whitespace tokens
def strip_whitespace(syntax):
    output = []
    for s in syntax:
        if s.type == SyntaxType.TOKEN and not is_whitespace(s.value):
            output.append(s)
    return output


# Tokenizes source code
def tokenize(source, filename):
    split = split_tokens(source)
    located = locate_tokens(split, filename)
    stripped = strip_whitespace(located)
    return stripped
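

# A minimal usage sketch, not part of the library proper: the sample input,
# filename, and expected values below are illustrative assumptions. It relies
# only on the .value attribute that the functions above already access.
if __name__ == "__main__":
    # Whitespace separates tokens and is stripped from the final output
    for token in tokenize("hello world\nsecond line", "example.txt"):
        print(repr(token.value))
    # Expected: 'hello', 'world', 'second', 'line'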