# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from src.syntax import Syntax, SyntaxLocation, SyntaxType


# Checks whether a symbol is a space or tab
def is_space(symbol):
    return symbol == " " or symbol == "\t"


# Checks whether a symbol is a new line
def is_newline(symbol):
    return symbol == "\n"


# Checks whether a symbol is general whitespace
def is_whitespace(symbol):
    return is_space(symbol) or is_newline(symbol)


# Splits text into a list of word tokens and individual whitespace symbols
def split_tokens(input):
    if input == "":
        return []
    tokens = []
    current = input[0]
    curr_whitespace = is_whitespace(input[0])
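    # Use a placeholder location here; locate_tokens assigns real positions later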
    location = SyntaxLocation(1, 1, "")
    for c in input[1:]:
        c_whitespace = is_whitespace(c)
        if c_whitespace != curr_whitespace:
            # Flush current buffer and switch modes
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
            curr_whitespace = c_whitespace
        elif curr_whitespace:
            # Whitespace mode appends each character
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
        else:
            # Token mode builds the current buffer
            current += c
    tokens.append(Syntax(current, location, SyntaxType.TOKEN))
    return tokens
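

# Illustrative sketch of split_tokens behaviour (assuming Syntax stores the
# token text as its value): splitting "hi  there\n" should yield the values
# ["hi", " ", " ", "there", "\n"]; each whitespace symbol becomes its own
# token, while adjacent non-whitespace symbols are merged into one token.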


# Assigns line and column locations to a list of tokens
def locate_tokens(tokens, filename):
    new_tokens = []
    line = 1
    column = 1
    for t in tokens:
        location = SyntaxLocation(line, column, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        new_tokens.append(new)
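        # Advance the position: a newline starts the next line at column 1,
        # any other token moves the column forward by its length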
        if is_newline(t.value):
            line += 1
            column = 1
        else:
            column += len(t.value)
    return new_tokens
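

# Illustrative sketch of locate_tokens behaviour: given the token values
# ["hi", " ", "\n", "x"] and the filename "demo.txt" (a hypothetical name),
# the assigned positions should be (line 1, column 1), (1, 3), (1, 4)
# and (2, 1); the newline token starts the next line at column 1.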


# Tokenizes source code
def tokenize(source, filename):
    split = split_tokens(source)
    located = locate_tokens(split, filename)
    return located
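

# Example usage (a sketch; "example.new" is just an illustrative filename):
#   tokens = tokenize("let x\n", "example.new")
#   # token values: ["let", " ", "x", "\n"], each carrying its own
#   # SyntaxLocation with line, column and filename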