# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from src.syntax import Syntax, SyntaxLocation, SyntaxType

# Valid space code points
spaces = [
    "\t",  # U+0009 HORIZONTAL TAB
    " ",  # U+0020 SPACE
]

# Valid newline code points
newlines = [
    "\n",  # U+000A LINE FEED
    "\v",  # U+000B VERTICAL TAB
    "\f",  # U+000C FORM FEED
    "\r",  # U+000D CARRIAGE RETURN
    "\u0085",  # U+0085 NEXT LINE
    "\u2028",  # U+2028 LINE SEPARATOR
    "\u2029",  # U+2029 PARAGRAPH SEPARATOR
]


# Checks whether a symbol is general whitespace
def is_whitespace(symbol):
    return symbol in spaces or symbol in newlines


# Splits text into a list of tokens and whitespace
def split_tokens(input):
    if input == "":
        return []
    tokens = []
    current = input[0]
    curr_whitespace = is_whitespace(input[0])
    # Placeholder location; real locations are filled in by locate_tokens
    location = SyntaxLocation(1, 1, "")
    for c in input[1:]:
        c_whitespace = is_whitespace(c)
        if c_whitespace != curr_whitespace:
            # Flush the current buffer and switch modes
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
            curr_whitespace = c_whitespace
        elif curr_whitespace:
            # Whitespace mode emits each code point as its own token
            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
            current = c
        else:
            # Token mode accumulates code points into the current buffer
            current += c
    # Flush whatever is left in the buffer
    tokens.append(Syntax(current, location, SyntaxType.TOKEN))
    return tokens


# Generates a list of tokens with locations
def locate_tokens(tokens, filename):
    new_tokens = []
    line = 1
    offset = 1
    for t in tokens:
        location = SyntaxLocation(line, offset, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        new_tokens.append(new)
        if t.value in newlines:
            # Each newline code point starts a new line at offset 1
            line += 1
            offset = 1
        else:
            offset += len(t.value)
    return new_tokens


# Removes whitespace tokens
def strip_whitespace(syntax):
    output = []
    for s in syntax:
        if s.type == SyntaxType.TOKEN and not is_whitespace(s.value):
            output.append(s)
    return output


# Tokenizes source code
def tokenize(source, filename):
    split = split_tokens(source)
    located = locate_tokens(split, filename)
    stripped = strip_whitespace(located)
    return stripped
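

# A minimal usage sketch, not part of the library proper: the sample input,
# filename, and expected values below are illustrative assumptions. It relies
# only on the .value attribute that the functions above already access.
if __name__ == "__main__":
    # Whitespace separates tokens and is stripped from the final output
    for token in tokenize("hello world\nsecond line", "example.txt"):
        print(repr(token.value))
    # Expected: 'hello', 'world', 'second', 'line'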