# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given, assume
from hypothesis.strategies import (
    booleans,
    characters,
    composite,
    integers,
    just,
    lists,
    one_of,
    sampled_from,
    text,
)

from src import tokenize
from src.syntax import Syntax, SyntaxLocation, SyntaxType


# Keywords recognized by the language
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
    "StartNote",
    "EndNote",
    "StartText",
    "EndText",
]


# Draws a random token location
@composite
def draw_token_location(draw):
    line = draw(integers())
    column = draw(integers())
    filename = draw(text())
    return SyntaxLocation(line, column, filename)


# Draws a random syntax type
@composite
def draw_syntax_type(draw):
    return draw(sampled_from(list(SyntaxType)))


# Draws a token syntax value
@composite
def draw_syntax_token(draw):
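    # draw_token_classified yields a Syntax token, so reuse only its string value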
    value = draw(draw_token_classified())
    location = draw(draw_token_location())
    type = SyntaxType.TOKEN
    return Syntax(value.value, location, type)


# Draws a text syntax value
@composite
def draw_syntax_text(draw):
    value = draw(text())
    location = draw(draw_token_location())
    type = SyntaxType.TEXT
    return Syntax(value, location, type)


# Draws a random syntax
@composite
def draw_syntax_random(draw):
    strategies = [
        draw_syntax_token(),
        draw_syntax_text(),
    ]
    return draw(one_of(strategies))


# Test syntax getters
@given(text(), draw_token_location(), draw_syntax_type())
def test_tokenize_syntax_getters(value, location, type):
    # Use text as a somewhat random value
    test = Syntax(value, location, type)
    assert test.value == value
    assert test.location == location
    assert test.type == type


# Test syntax equals
@given(draw_syntax_random(), draw_syntax_random())
def test_tokenize_syntax_equality(syntax1, syntax2):
    equals = (
        syntax1.type == syntax2.type
        and syntax1.value == syntax2.value
        and syntax1.location == syntax2.location
    )
    assert (syntax1 == syntax2) == equals


# Draws a random token
@composite
def draw_token_random(draw):
    value = draw(text())
    location = draw(draw_token_location())
    return Syntax(value, location, SyntaxType.TOKEN)


# Draws an unknown token
@composite
def draw_token_unknown(draw):
    reserved = " \n\t"
    token = draw(draw_token_random())
    chars = characters(blacklist_characters=reserved)
    value = draw(text(alphabet=chars, min_size=1))
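    # Filter out values that would classify as other token types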
    assume(value not in ["True", "False"])
    assume(value not in keywords)
    assume(value[0:2] != "#!")
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draws a space token
@composite
def draw_token_space(draw):
    space = " \t"
    token = draw(draw_token_random())
    value = draw(sampled_from(space))
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draws a new line token
@composite
def draw_token_newline(draw):
    token = draw(draw_token_random())
    value = "\n"
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draws a bool token
@composite
def draw_token_bool(draw):
    token = draw(draw_token_random())
    if draw(booleans()):
        value = "True"
    else:
        value = "False"
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draws a keyword token
@composite
def draw_token_keyword(draw):
    token = draw(draw_token_random())
    value = draw(sampled_from(keywords))
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draws a shebang token
@composite
def draw_token_shebang(draw):
    token = draw(draw_token_random())
    value = "#!" + draw(text())
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draws a classified token
@composite
def draw_token_classified(draw):
    strategies = [
        draw_token_unknown(),
        draw_token_space(),
        draw_token_newline(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_shebang(),
    ]
    token = draw(one_of(strategies))
    return token


# Test location getters
@given(integers(), integers(), text())
def test_tokenize_location_getters(line, column, filename):
    test = SyntaxLocation(line, column, filename)
    assert test.line == line
    assert test.column == column
    assert test.file == filename


# Test location equals
@given(draw_token_location(), draw_token_location())
def test_tokenize_location_equality(location1, location2):
    equals = (
        location1.line == location2.line
        and location1.column == location2.column
        and location1.file == location2.file
    )
    assert (location1 == location2) == equals


# Test token getters
@given(text(), draw_token_location())
def test_tokenize_token_getters(value, location):
    test = Syntax(value, location, SyntaxType.TOKEN)
    assert test.value == value
    assert test.location == location


# Test token equals
@given(draw_token_random(), draw_token_random())
def test_tokenize_token_equality(token1, token2):
    equals = token1.value == token2.value and token1.location == token2.location
    assert (token1 == token2) == equals


# Draws a token using an existing strategy, but with the blank
# location that split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = SyntaxLocation(1, 1, "")
    return Syntax(token.value, location, SyntaxType.TOKEN)


# Generates an alternating sequence of unknown and whitespace tokens
# intended for splitting into separate tokens
@composite
def draw_tokens_to_split(draw):
    source = ""
    tokens = []
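    # Draw a throwaway list purely to choose a random number of segments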
    elements = draw(lists(just(True)))
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespace characters get split into multiple tokens
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            tokens += draw(lists(locationed, min_size=1))
        else:
            strategy = draw_token_unknown()
            locationed = draw_token_splitted(strategy)
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    for t in tokens:
        source += t.value
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is a tab, space or new line
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated into tokens
# - Whitespace characters are split into multiple tokens
# - Non-whitespace characters are combined into a single token
# - Each token location is line 1 column 1 of file ""
@given(draw_tokens_to_split())
def test_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens
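

# A concrete sketch of the rules above, using a hypothetical input
# rather than a drawn one: each whitespace character becomes its own
# token while runs of non-whitespace stay combined
def test_tokenize_split_tokens_example():
    location = SyntaxLocation(1, 1, "")
    expected = [
        Syntax("Hi", location, SyntaxType.TOKEN),
        Syntax(" ", location, SyntaxType.TOKEN),
        Syntax(" ", location, SyntaxType.TOKEN),
        Syntax("you", location, SyntaxType.TOKEN),
        Syntax("\n", location, SyntaxType.TOKEN),
    ]
    assert tokenize.split_tokens("Hi  you\n") == expected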


# Generates a list of tokens and their corrected locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_token_classified()))
    filename = draw(text())
    located = []
    line = 1
    column = 1
    for t in tokens:
        location = SyntaxLocation(line, column, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        located.append(new)
        if t.value == "\n":
            line = line + 1
            column = 1
        else:
            column += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A token's line is equal to 1 plus the number of tokens with the
#   value '\n' before the token
# - A token's column is equal to 1 plus the sum of the value
#   lengths of all tokens 'before' it, where 'before' means any
#   token between the last '\n' (or the start of the file) and
#   the token itself
@given(draw_tokens_locations())
def test_tokenize_locations(test_data):
    (tokens, located, filename) = test_data
    assert tokenize.locate_tokens(tokens, filename) == located
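

# A minimal worked example of the location rules above, using a
# hypothetical input and filename: the newline token sits at line 1
# column 3, and "you" starts line 2 at column 1
def test_tokenize_locations_example():
    split = tokenize.split_tokens("Hi\nyou")
    located = tokenize.locate_tokens(split, "example.nl")
    assert located == [
        Syntax("Hi", SyntaxLocation(1, 1, "example.nl"), SyntaxType.TOKEN),
        Syntax("\n", SyntaxLocation(1, 3, "example.nl"), SyntaxType.TOKEN),
        Syntax("you", SyntaxLocation(2, 1, "example.nl"), SyntaxType.TOKEN),
    ]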


# Draws a token and possibly adds garbage
# This is to ensure that tokens must completely match a value
@composite
def draw_token_classified_garbled(draw):
    token = draw(draw_token_classified())
    value = token.value
    if draw(booleans()):
        value = draw(text(min_size=1)) + value
    if draw(booleans()):
        value = value + draw(text(min_size=1))
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draw a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_token_classified()))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer sets the token locations as expected
@given(draw_source_fuzz(), text())
def test_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    located = tokenize.locate_tokens(split, filename)
    tokenized = tokenize.tokenize(source, filename)
    assert located == tokenized
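

# The same composition written out concretely, with a hypothetical
# source and filename: tokenize should match split_tokens followed
# by locate_tokens
def test_tokenize_fuzz_example():
    source = "If True Then\n"
    split = tokenize.split_tokens(source)
    expected = tokenize.locate_tokens(split, "example.nl")
    assert tokenize.tokenize(source, "example.nl") == expected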