# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

import enum


# The type of a syntax node
class SyntaxType(enum.Enum):
    TOKEN = enum.auto()  # pragma: no mutate
    TEXT = enum.auto()  # pragma: no mutate


# Represents a syntax node
class Syntax:
    def __init__(self, value, location, type):
        self.value = value  # The text this node holds
        self.location = location  # Where the node appears in the source
        self.type = type  # A SyntaxType value

    def __repr__(self):
        return "Syntax(value %s, location %s, type %s)" % (  # pragma: no mutate
            repr(self.value),
            repr(self.location),
            str(self.type),
        )

    def __eq__(self, other):
        return (
            self.type == other.type
            and self.value == other.value
            and self.location == other.location
        )
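
# Illustrative note: two Syntax nodes compare equal only when their
# value, location and type all match, e.g. for any location loc,
#   Syntax("Hi", loc, SyntaxType.TOKEN) == Syntax("Hi", loc, SyntaxType.TOKEN)
# is True, while changing any one field makes the comparison False.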


# Location of a syntax node
class SyntaxLocation:
    def __init__(self, line, column, file):
        self.line = line
        self.column = column
        self.file = file

    def __repr__(self):
        return "SyntaxLocation(line %i, column %i, file '%s')" % (  # pragma: no mutate
            self.line,
            self.column,
            self.file,
        )

    def __eq__(self, other):
        return (
            self.line == other.line
            and self.column == other.column
            and self.file == other.file
        )
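
# For example, SyntaxLocation(2, 5, "hello.newlang") names line 2,
# column 5 of the file "hello.newlang" (an illustrative name). Lines
# and columns are 1-based: locate_tokens below starts counting at
# line 1, column 1.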


# Checks whether a symbol is a space or tab
def is_space(symbol):
    return symbol == " " or symbol == "\t"


# Checks whether a symbol is a new line
def is_newline(symbol):
    return symbol == "\n"


# Checks whether a symbol is general whitespace
def is_whitespace(symbol):
    return is_space(symbol) or is_newline(symbol)
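
# For example, is_whitespace(" "), is_whitespace("\t") and
# is_whitespace("\n") are all True, while is_whitespace("a") is False.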


# Splits text into a list of non-whitespace tokens and individual whitespace characters
def split_tokens(text):
    if text == "":
        return []
    tokens = []
    current = text[0]
    curr_whitespace = is_whitespace(text[0])
    # Placeholder location; locate_tokens assigns the real ones later
    location = SyntaxLocation(1, 1, "")
    for c in text[1:]:
        c_whitespace = is_whitespace(c)
        if c_whitespace != curr_whitespace:
            # Flush current buffer and switch modes
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
            curr_whitespace = c_whitespace
        elif curr_whitespace:
            # Whitespace mode appends each character
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
        else:
            # Token mode builds the current buffer
            current += c
    tokens.append(Syntax(current, location, SyntaxType.TOKEN))
    return tokens
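
# Illustrative sketch of split_tokens' behaviour: splitting "Hi there\n"
# yields four nodes holding "Hi", " ", "there" and "\n", since runs of
# non-whitespace are grouped while every whitespace symbol becomes its
# own node. Each node carries the placeholder location (1, 1, "") until
# locate_tokens assigns real positions.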


# Keywords recognized by the language
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
    "StartNote",
    "EndNote",
    "StartText",
    "EndText",
]


# Generates a list of tokens with locations
def locate_tokens(tokens, filename):
    new_tokens = []
    line = 1
    column = 1
    for t in tokens:
        location = SyntaxLocation(line, column, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        new_tokens.append(new)
        if is_newline(t.value):
            line += 1
            column = 1
        else:
            column += len(t.value)
    return new_tokens
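
# For example, locating the nodes split from "Hi there\n" (with an
# illustrative filename "f") places "Hi" at line 1 column 1, " " at
# column 3, "there" at column 4 and "\n" at column 9; any node after
# the newline would start at line 2, column 1.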


# Tokenizes source code
def tokenize(source, filename):
    split = split_tokens(source)
    located = locate_tokens(split, filename)
    return located
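

# A minimal usage sketch when running this module directly; the source
# string and the filename "example.newlang" are purely illustrative.
if __name__ == "__main__":
    for node in tokenize("Set x To 1\n", "example.newlang"):
        print(node)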