# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from src.syntax import Syntax, SyntaxLocation

# Valid space code points
spaces = [
    "\t",  # U+0009 HORIZONTAL TAB
    " ",  # U+0020 SPACE
]

# Valid new line tokens
newlines = [
    "\n",  # U+000A LINE FEED
    "\v",  # U+000B VERTICAL TAB
    "\f",  # U+000C FORM FEED
    "\r",  # U+000D CARRIAGE RETURN
    "\r\n",  # U+000D U+000A CARRIAGE RETURN then LINE FEED
    "\u0085",  # U+0085 NEXT LINE
    "\u2028",  # U+2028 LINE SEPARATOR
    "\u2029",  # U+2029 PARAGRAPH SEPARATOR
]


# Checks whether a symbol is general whitespace
def is_whitespace(symbol):
    return symbol in spaces or symbol in newlines


# Splits text into a list of tokens and whitespace
def split_tokens(input):
    if input == "":
        return []
    tokens = []
    prev = input[0]
    buffer = prev
    # Placeholder location; real locations are filled in later by locate_tokens
    location = SyntaxLocation(1, 1, "")
    for curr in input[1:]:
        curr_space = is_whitespace(curr)
        prev_space = is_whitespace(prev)
        switching = curr_space != prev_space
        crlf = prev == "\r" and curr == "\n"
        # Flush if we switch between whitespace and non-whitespace code points
        # Flush if we're working with a stream of whitespace
        # Don't flush if we're in the middle of a CR LF sequence
        flush = switching or (curr_space and not crlf)
        if flush:
            tokens.append(Syntax(buffer, location))
            buffer = ""
        buffer += curr
        prev = curr
    tokens.append(Syntax(buffer, location))
    return tokens


# Generates a list of tokens with locations
def locate_tokens(tokens, filename):
    new_tokens = []
    line = 1
    offset = 1
    for t in tokens:
        location = SyntaxLocation(line, offset, filename)
        new = Syntax(t.value, location)
        new_tokens.append(new)
        if t.value in newlines:
            line += 1
            offset = 1
        else:
            offset += len(t.value)
    return new_tokens


# Removes whitespace tokens
def strip_whitespace(syntax):
    output = []
    for s in syntax:
        if not is_whitespace(s.value):
            output.append(s)
    return output


# Tokenizes source code
def tokenize(source, filename):
    split = split_tokens(source)
    located = locate_tokens(split, filename)
    stripped = strip_whitespace(located)
    return stripped
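

# A minimal usage sketch, not part of the original module: it assumes this
# file is importable alongside src/syntax.py (e.g. run from the repo root
# with `python -m src.tokenize`). The demo name and input string are
# illustrative only; the only Syntax attribute read here is .value, the
# same one the functions above rely on.
if __name__ == "__main__":
    demo = "hello world\r\nsecond line"
    tokens = tokenize(demo, "demo.txt")
    # Whitespace tokens have been stripped, so only the words remain
    print([t.value for t in tokens])  # ['hello', 'world', 'second', 'line']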