Newer
Older
NewLang / src / tokenize.py
# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

# Reports whether a symbol is exactly one of the recognised whitespace
# characters: a space, a tab or a newline
def is_whitespace(symbol):
    return symbol in (" ", "\t", "\n")


# Breaks text into an ordered list of pieces: each run of consecutive
# non-whitespace characters becomes one entry, and every whitespace
# character becomes an entry of its own. Empty text gives an empty list.
def split_symbols(input):
    if input == "":
        return []
    pieces = []
    pending = input[0]
    in_whitespace = is_whitespace(pending)
    for char in input[1:]:
        char_is_whitespace = is_whitespace(char)
        if not in_whitespace and not char_is_whitespace:
            # Still inside a run of non-whitespace: keep growing it
            pending += char
            continue
        # Either the mode changed or we are in whitespace, where every
        # character stands alone; emit what we have and start afresh
        pieces.append(pending)
        pending = char
        in_whitespace = char_is_whitespace
    # The final piece is never emitted inside the loop
    pieces.append(pending)
    return pieces