# NewLang / src / tokenize.py
# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

# Checks whether a symbol is a single whitespace character
def is_whitespace(symbol):
    """Return True when symbol is exactly a space, a tab or a newline."""
    return symbol in (" ", "\t", "\n")


# Location of a symbol
class SymbolLocation:
    """Position of a symbol: a line, a column and the file it came from.

    Instances compare equal when all three fields match. Because __eq__
    is defined without __hash__, instances are not hashable.
    """

    def __init__(self, line, column, file):
        # line and column are formatted with %i in __repr__, file with %s
        self.line = line
        self.column = column
        self.file = file

    def __repr__(self):
        return "SymbolLocation(line %i, column %i, file '%s')" % (  # pragma: no mutate
            self.line,
            self.column,
            self.file,
        )

    def __eq__(self, other):
        # Defer to Python's default handling for foreign types instead of
        # raising AttributeError on objects without line/column/file.
        if not isinstance(other, SymbolLocation):
            return NotImplemented
        return (
            self.line == other.line
            and self.column == other.column
            and self.file == other.file
        )


# Represents a tokenizer symbol
class Symbol:
    """A tokenizer symbol: a value paired with the location it was found at.

    Two symbols are equal when both their values and their locations are
    equal. Because __eq__ is defined without __hash__, instances are not
    hashable.
    """

    def __init__(self, value, location):
        self.value = value
        self.location = location

    def __repr__(self):
        return "Symbol(value %s, location %s)" % (  # pragma: no mutate
            repr(self.value),
            repr(self.location),
        )

    def __eq__(self, other):
        # Defer to Python's default handling for foreign types instead of
        # raising AttributeError on objects without value/location.
        if not isinstance(other, Symbol):
            return NotImplemented
        return self.value == other.value and self.location == other.location


# Splits text into a list of words and individual whitespace characters
def split_symbols(input):
    """Break input into Symbols.

    Each whitespace character becomes its own Symbol, while every run of
    consecutive non-whitespace characters is merged into a single Symbol.
    All returned Symbols share one placeholder location (line 1, column 1,
    empty file name). Empty input gives an empty list.
    """
    placeholder = SymbolLocation(1, 1, "")
    result = []
    word = []  # characters of the non-whitespace run being collected
    for ch in input:
        if not is_whitespace(ch):
            word.append(ch)
            continue
        # A whitespace character ends any pending word...
        if word:
            result.append(Symbol("".join(word), placeholder))
            word = []
        # ...and is always emitted as a symbol of its own
        result.append(Symbol(ch, placeholder))
    # Emit the trailing word when the text does not end in whitespace
    if word:
        result.append(Symbol("".join(word), placeholder))
    return result


# Builds a new list of symbols carrying their line and column positions
def locate_symbols(symbols, filename):
    """Return copies of symbols with locations computed from their values.

    Positions start at line 1, column 1. A symbol whose value is a newline
    moves the position to column 1 of the next line; any other symbol
    advances the column by the length of its value. The input symbols are
    not modified; filename is stored in every generated location.
    """
    located = []
    row, col = 1, 1
    for sym in symbols:
        located.append(Symbol(sym.value, SymbolLocation(row, col, filename)))
        if sym.value != "\n":
            # Stay on the same line, stepping past this symbol's text
            col += len(sym.value)
            continue
        # Newline: the next symbol starts the following line
        row += 1
        col = 1
    return located