# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

import enum


# Checks whether a symbol is a space or tab
def is_space(symbol):
    return symbol == " " or symbol == "\t"


# Checks whether a symbol is a newline
def is_newline(symbol):
    return symbol == "\n"


# Checks whether a symbol is general whitespace
def is_whitespace(symbol):
    return is_space(symbol) or is_newline(symbol)


# Location of a token
class TokenLocation:
    def __init__(self, line, column, file):
        self.line = line
        self.column = column
        self.file = file

    def __repr__(self):
        return "TokenLocation(line %i, column %i, file '%s')" % (  # pragma: no mutate
            self.line,
            self.column,
            self.file,
        )

    def __eq__(self, other):
        return (
            self.line == other.line
            and self.column == other.column
            and self.file == other.file
        )


# The type of a token
class TokenType(enum.Enum):
    UNKNOWN = enum.auto()  # pragma: no mutate
    SPACE = enum.auto()  # pragma: no mutate
    NEWLINE = enum.auto()  # pragma: no mutate
    BOOL = enum.auto()  # pragma: no mutate
    KEYWORD = enum.auto()  # pragma: no mutate
    SHEBANG = enum.auto()  # pragma: no mutate


# Represents a tokenizer token
class Token:
    def __init__(self, value, location, type):
        self.value = value
        self.location = location
        self.type = type

    def __repr__(self):
        return "Token(value %s, location %s, type %s)" % (  # pragma: no mutate
            repr(self.value),
            repr(self.location),
            str(self.type),
        )

    def __eq__(self, other):
        return (
            self.value == other.value
            and self.location == other.location
            and self.type == other.type
        )


# Splits text into a list of characters and whitespace
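# For example, split_tokens("Hi \nthere") yields four tokens with the
# values "Hi", " ", "\n" and "there": each whitespace symbol becomes its
# own token, while other symbols are grouped into words. Types and
# locations are placeholders until classify_tokens and locate_tokens
# fill them in.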
def split_tokens(input):
    if input == "":
        return []
    tokens = []
    current = input[0]
    curr_whitespace = is_whitespace(input[0])
    location = TokenLocation(1, 1, "")
    type = TokenType.UNKNOWN
    for c in input[1:]:
        c_whitespace = is_whitespace(c)
        if c_whitespace != curr_whitespace:
            # Flush current buffer and switch modes
            tokens.append(Token(current, location, type))
            current = c
            curr_whitespace = c_whitespace
        elif curr_whitespace:
            # Whitespace mode appends each character
            tokens.append(Token(current, location, type))
            current = c
        else:
            # Token mode builds the current buffer
            current += c
    tokens.append(Token(current, location, type))
    return tokens


# Keywords recognized by the language
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
    "StartNote",
    "EndNote",
    "StartText",
    "EndText",
]


# Classifies tokens into types
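# For example, the split of "If True\n" is classified as KEYWORD ("If"),
# SPACE (" "), BOOL ("True") and NEWLINE ("\n"); a token whose value
# starts with "#!" becomes a SHEBANG, and anything else stays UNKNOWN.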
def classify_tokens(tokens):
    new_tokens = []
    for t in tokens:
        if is_newline(t.value):
            type = TokenType.NEWLINE
        elif is_space(t.value):
            type = TokenType.SPACE
        elif t.value in ["True", "False"]:
            type = TokenType.BOOL
        elif t.value in keywords:
            type = TokenType.KEYWORD
        elif t.value[0:2] == "#!":
            type = TokenType.SHEBANG
        else:
            type = TokenType.UNKNOWN
        new = Token(t.value, t.location, type)
        new_tokens.append(new)
    return new_tokens


# Generates a list of tokens with locations
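# Columns advance by the length of each token's value, and a NEWLINE
# resets the position to the start of the next line. For example, locating
# the classified tokens of "If True\n" places "If" at column 1, " " at
# column 3, "True" at column 4 and "\n" at column 8, all on line 1.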
def locate_tokens(tokens, filename):
    new_tokens = []
    line = 1
    column = 1
    for t in tokens:
        location = TokenLocation(line, column, filename)
        new = Token(t.value, location, t.type)
        new_tokens.append(new)
        if t.type == TokenType.NEWLINE:
            line = line + 1
            column = 1
        else:
            column += len(t.value)
    return new_tokens


# Tokenizes source code
def tokenize(source, filename):
    split = split_tokens(source)
    classified = classify_tokens(split)
    located = locate_tokens(classified, filename)
    return located
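

# Illustrative usage sketch (the sample program and file name below are
# made up for demonstration and are not part of the NewLang sources):
if __name__ == "__main__":
    sample = "#!/bin/env NewLang\nIf True Then\nDone\n"
    for token in tokenize(sample, "example.new"):
        print(token)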