# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given
from hypothesis.strategies import (
    booleans,
    composite,
    just,
    lists,
    one_of,
    text,
)

from src import tokenize
from src.syntax import Syntax, SyntaxLocation, SyntaxType
from tests.test_syntax import (
    draw_token_classified,
    draw_token_newline,
    draw_token_space,
    draw_token_unknown,
)


# Draws a token using an existing strategy, but with a blank location
# just like split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = SyntaxLocation(1, 1, "")
    return Syntax(token.value, location, SyntaxType.TOKEN)


# Generates an alternating sequence of unknown or whitespace tokens
# intended for splitting into separate tokens
@composite
def draw_tokens_to_split(draw):
    source = ""
    tokens = []
    elements = draw(lists(just(True)))
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespaces get split into multiple tokens
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            tokens += draw(lists(locationed, min_size=1))
        else:
            strategy = draw_token_unknown
            locationed = draw_token_splitted(strategy())
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    for t in tokens:
        source += t.value
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is a tab, space or new line
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated into tokens
# - Whitespace characters are split into multiple tokens
# - Non-whitespace characters are combined into a single token
# - Each token location is line 1 offset 1 of file ""
@given(draw_tokens_to_split())
def test_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens


# Generates a list of tokens and their corrected locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_token_classified()))
    filename = draw(text())
    located = []
    line = 1
    offset = 1
    for t in tokens:
        location = SyntaxLocation(line, offset, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        located.append(new)
        if t.value == "\n":
            line = line + 1
            offset = 1
        else:
            offset += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A location's line is equal to 1 plus the number of tokens with the
#   value '\n' before the token
# - A location's offset is equal to 1 plus the count of previous tokens'
#   value code points 'before' it, where 'before' is defined as any
#   token between the last '\n' (or start of file) before the token
#   and the token itself
@given(draw_tokens_locations())
def test_tokenize_locations(test_data):
    (input, located, filename) = test_data
    assert tokenize.locate_tokens(input, filename) == located


# Draws a token and possibly adds garbage
# This is to ensure that tokens must completely match a value
@composite
def draw_token_classified_garbled(draw):
    token = draw(draw_token_classified())
    value = token.value
    if draw(booleans()):
        value = draw(text(min_size=1)) + value
    if draw(booleans()):
        value = value + draw(text(min_size=1))
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draws a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_token_classified()))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer sets the token locations as expected
@given(draw_source_fuzz(), text())
def test_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    located = tokenize.locate_tokens(split, filename)
    tokenized = tokenize.tokenize(source, filename)
    assert located == tokenized
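

# A minimal illustrative sketch of the behaviour the tests above describe,
# not part of the property-based suite. The concrete input "ab \nc" and the
# expected locations below are an assumption derived from the comments above
# (non-whitespace runs combine, whitespace splits per character, lines advance
# on '\n'), not from the tokenizer implementation itself. It relies on Syntax
# comparing by value, as the assertions above already do.
if __name__ == "__main__":
    expected = [
        Syntax("ab", SyntaxLocation(1, 1, "demo"), SyntaxType.TOKEN),
        Syntax(" ", SyntaxLocation(1, 3, "demo"), SyntaxType.TOKEN),
        Syntax("\n", SyntaxLocation(1, 4, "demo"), SyntaxType.TOKEN),
        Syntax("c", SyntaxLocation(2, 1, "demo"), SyntaxType.TOKEN),
    ]
    assert tokenize.tokenize("ab \nc", "demo") == expected
    print("tokenize example matches the expected tokens")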