# NewLang: src/tokenize.py
# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from src.syntax import Syntax, SyntaxLocation, SyntaxType

# Valid space code points
spaces = [
    "\t",  # U+0009 HORIZONTAL TAB
    " ",  # U+0020 SPACE
]

# Valid new line tokens
newlines = [
    "\n",  # U+000A LINE FEED
    "\v",  # U+000B VERTICAL TAB
    "\f",  # U+000C FORM FEED
    "\r",  # U+000D CARRIAGE RETURN
    "\r\n",  # U+000A U+000D CARRIAGE RETURN then LINE FEED
    "\u0085",  # U+0085 NEXT LINE
    "\u2028",  # U+2028 LINE SEPARATOR
    "\u2029",  # U+2029 PARAGRAPH SEPARATOR
]


# Checks whether a symbol is general whitespace
def is_whitespace(symbol):
    return symbol in spaces or symbol in newlines


# Splits text into a list of tokens and whitespace
def split_tokens(input):
    if input == "":
        return []
    tokens = []
    prev = input[0]
    buffer = prev
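    # Placeholder location; locate_tokens assigns the real line/offset later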
    location = SyntaxLocation(1, 1, "")
    for curr in input[1:]:
        curr_space = is_whitespace(curr)
        prev_space = is_whitespace(prev)
        switching = curr_space != prev_space
        crlf = prev == "\r" and curr == "\n"
        # Flush if we switch between whitespace and non-whitespace code points
        # Flush between consecutive whitespace code points so each becomes its own token
        # Don't flush in the middle of a CR LF sequence, so it stays one newline token
        flush = switching or (curr_space and not crlf)
        if flush:
            tokens.append(Syntax(buffer, location, SyntaxType.TOKEN))
            buffer = ""
        buffer += curr
        prev = curr
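    # Flush whatever is left in the buffer as the final token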
    tokens.append(Syntax(buffer, location, SyntaxType.TOKEN))
    return tokens


# Attaches line, offset and filename locations to tokens
def locate_tokens(tokens, filename):
    new_tokens = []
    line = 1
    offset = 1
    for t in tokens:
        location = SyntaxLocation(line, offset, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        new_tokens.append(new)
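        # Newline tokens (including the single CR LF token) start a new line;
        # everything else advances the offset by its length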
        if t.value in newlines:
            line += 1
            offset = 1
        else:
            offset += len(t.value)
    return new_tokens


# Removes whitespace tokens
def strip_whitespace(syntax):
    output = []
    for s in syntax:
        if s.type == SyntaxType.TOKEN and not is_whitespace(s.value):
            output.append(s)
    return output


# Tokenizes source code
def tokenize(source, filename):
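    # Split raw text, attach line/offset/filename locations, then drop whitespace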
    split = split_tokens(source)
    located = locate_tokens(split, filename)
    stripped = strip_whitespace(located)
    return stripped
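

# A minimal usage sketch, not part of the original module: assuming the
# src.syntax package is importable (e.g. running `python -m src.tokenize`
# from the project root), this tokenizes a made-up two-line snippet and
# prints the value of each remaining non-whitespace token.
if __name__ == "__main__":
    example_source = "hello world\nsecond line\n"
    for token in tokenize(example_source, "example.txt"):
        print(repr(token.value))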