# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given, assume
from hypothesis.strategies import (
    booleans,
    characters,
    composite,
    integers,
    just,
    lists,
    one_of,
    sampled_from,
    text,
)

from src import tokenize


# Keywords recognized by the language
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
    "StartNote",
    "EndNote",
    "StartText",
    "EndText",
]


# Draws a random token location
@composite
def draw_token_location(draw):
    line = draw(integers())
    column = draw(integers())
    filename = draw(text())
    return tokenize.TokenLocation(line, column, filename)


# Draws a random token type
@composite
def draw_token_type(draw):
    return draw(sampled_from(list(tokenize.TokenType)))


# Draws a random token
@composite
def draw_token_random(draw):
    value = draw(text())
    location = draw(draw_token_location())
    type = draw(draw_token_type())
    return tokenize.Token(value, location, type)


# Draws an unknown token
@composite
def draw_token_unknown(draw):
    reserved = " \n\t"
    token = draw(draw_token_random())
    chars = characters(blacklist_characters=reserved)
    value = draw(text(alphabet=chars, min_size=1))
    assume(value not in ["True", "False"])
    assume(value not in keywords)
    assume(value[0:2] != "#!")
    type = tokenize.TokenType.UNKNOWN
    return tokenize.Token(value, token.location, type)


# Draws a space token
@composite
def draw_token_space(draw):
    space = " \t"
    token = draw(draw_token_random())
    value = draw(sampled_from(space))
    type = tokenize.TokenType.SPACE
    return tokenize.Token(value, token.location, type)


# Draws a new line token
@composite
def draw_token_newline(draw):
    token = draw(draw_token_random())
    value = "\n"
    type = tokenize.TokenType.NEWLINE
    return tokenize.Token(value, token.location, type)


# Draws a bool token
@composite
def draw_token_bool(draw):
    token = draw(draw_token_random())
    if draw(booleans()):
        value = "True"
    else:
        value = "False"
    type = tokenize.TokenType.BOOL
    return tokenize.Token(value, token.location, type)


# Draws a keyword token
@composite
def draw_token_keyword(draw):
    token = draw(draw_token_random())
    value = draw(sampled_from(keywords))
    type = tokenize.TokenType.KEYWORD
    return tokenize.Token(value, token.location, type)


# Draws a shebang token
@composite
def draw_token_shebang(draw):
    token = draw(draw_token_random())
    value = "#!" + draw(text())
    type = tokenize.TokenType.SHEBANG
    return tokenize.Token(value, token.location, type)


# Draws a classified token
@composite
def draw_token_classified(draw):
    strategies = [
        draw_token_unknown(),
        draw_token_space(),
        draw_token_newline(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_shebang(),
    ]
    token = draw(one_of(strategies))
    return token


# Test location getters
@given(integers(), integers(), text())
def test_tokenize_location_getters(line, column, filename):
    test = tokenize.TokenLocation(line, column, filename)
    assert test.line == line
    assert test.column == column
    assert test.file == filename


# Test location equality
@given(draw_token_location(), draw_token_location())
def test_tokenize_location_equality(location1, location2):
    equals = (
        location1.line == location2.line
        and location1.column == location2.column
        and location1.file == location2.file
    )
    assert (location1 == location2) == equals


# Test token getters
@given(text(), draw_token_location(), draw_token_type())
def test_tokenize_token_getters(value, location, type):
    test = tokenize.Token(value, location, type)
    assert test.value == value
    assert test.location == location
    assert test.type == type


# Test token equality
@given(draw_token_random(), draw_token_random())
def test_tokenize_token_equality(token1, token2):
    equals = (
        token1.value == token2.value
        and token1.location == token2.location
        and token1.type == token2.type
    )
    assert (token1 == token2) == equals


# Draws a token using an existing strategy but with a blank location
# and unknown type, just like split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = tokenize.TokenLocation(1, 1, "")
    type = tokenize.TokenType.UNKNOWN
    return tokenize.Token(token.value, location, type)


# Generates a source string and the alternating sequence of unknown and
# whitespace tokens it should split into
@composite
def draw_tokens_to_split(draw):
    source = ""
    tokens = []
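    # The length of this list decides how many alternating runs to generate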
    elements = draw(lists(just(True)))
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespace characters get split into multiple tokens
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            tokens += draw(lists(locationed, min_size=1))
        else:
            strategy = draw_token_unknown
            locationed = draw_token_splitted(strategy())
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    for t in tokens:
        source += t.value
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is a tab, space or new line
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated into tokens
# - Consecutive whitespace characters are split into one token per character
# - Consecutive non-whitespace characters are combined into a single token
# - Each token type is UNKNOWN
# - Each token location is line 1 column 1 of file ""
@given(draw_tokens_to_split())
def test_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens
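

# A concrete example of the splitting behaviour described above, kept as an
# illustrative sanity check; it assumes split_tokens behaves exactly as the
# property test above expects
def test_tokenize_split_tokens_example():
    location = tokenize.TokenLocation(1, 1, "")
    expected = [
        tokenize.Token("Hello", location, tokenize.TokenType.UNKNOWN),
        tokenize.Token(" ", location, tokenize.TokenType.UNKNOWN),
        tokenize.Token(" ", location, tokenize.TokenType.UNKNOWN),
        tokenize.Token("World", location, tokenize.TokenType.UNKNOWN),
    ]
    assert tokenize.split_tokens("Hello  World") == expected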


# Generates a list of tokens, the same tokens with corrected locations, and a filename
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_token_random()))
    filename = draw(text())
    located = []
    line = 1
    column = 1
    for t in tokens:
        location = tokenize.TokenLocation(line, column, filename)
        new = tokenize.Token(t.value, location, t.type)
        located.append(new)
        if t.type == tokenize.TokenType.NEWLINE:
            line = line + 1
            column = 1
        else:
            column += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A token's line is equal to 1 plus the number of tokens with the
#   type NEWLINE before the token
# - A token's column is equal to 1 plus the sum of the value lengths of
#   the tokens 'before' it, where 'before' means any token between the
#   last NEWLINE (or the start of the file) and the token itself
@given(draw_tokens_locations())
def test_tokenize_locations(test_data):
    (input, located, filename) = test_data
    assert tokenize.locate_tokens(input, filename) == located
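

# A concrete example of the location behaviour described above, kept as an
# illustrative sanity check; the input tokens and the filename "example" are
# arbitrary values chosen to exercise the line and column rules
def test_tokenize_locations_example():
    blank = tokenize.TokenLocation(1, 1, "")
    filename = "example"
    input = [
        tokenize.Token("Set", blank, tokenize.TokenType.KEYWORD),
        tokenize.Token(" ", blank, tokenize.TokenType.SPACE),
        tokenize.Token("\n", blank, tokenize.TokenType.NEWLINE),
        tokenize.Token("Done", blank, tokenize.TokenType.KEYWORD),
    ]
    located = [
        tokenize.Token(
            "Set", tokenize.TokenLocation(1, 1, filename), tokenize.TokenType.KEYWORD
        ),
        tokenize.Token(
            " ", tokenize.TokenLocation(1, 4, filename), tokenize.TokenType.SPACE
        ),
        tokenize.Token(
            "\n", tokenize.TokenLocation(1, 5, filename), tokenize.TokenType.NEWLINE
        ),
        tokenize.Token(
            "Done", tokenize.TokenLocation(2, 1, filename), tokenize.TokenType.KEYWORD
        ),
    ]
    assert tokenize.locate_tokens(input, filename) == located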


# Draws a token and possibly adds garbage to its value
# This ensures that a type is only assigned when the whole value matches
@composite
def draw_token_classified_garbled(draw):
    token = draw(draw_token_classified())
    value = token.value
    type = token.type
    if draw(booleans()):
        value = draw(text(min_size=1)) + value
        type = tokenize.TokenType.UNKNOWN
    if draw(booleans()):
        value = value + draw(text(min_size=1))
        type = tokenize.TokenType.UNKNOWN
    if value[0:2] == "#!":
        type = tokenize.TokenType.SHEBANG
    return tokenize.Token(value, token.location, type)


# Generates a list of correctly classified tokens, along with the same
# tokens given arbitrary types to use as input
@composite
def draw_tokens_classified(draw):
    tokens = draw(lists(draw_token_classified_garbled()))
    input = []
    for t in tokens:
        type = draw(draw_token_type())
        input.append(tokenize.Token(t.value, t.location, type))
    return (input, tokens)


# Test that classification can add types properly
# We expect the following behaviour:
# - Only the token type is modified
# - The token's type is set based on which of the following conditions its
#   value meets:
#   - KEYWORD if it entirely matches a keyword
#   - BOOL if it is "True" or "False"
#   - SPACE if it is a space or tab character
#   - NEWLINE if it is a new line character
#   - SHEBANG if it starts with "#!"
#   - UNKNOWN if it is none of the above
@given(draw_tokens_classified())
def test_tokenize_classification(test_data):
    (input, tokens) = test_data
    assert tokenize.classify_tokens(input) == tokens
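

# A concrete example of the classification rules described above, kept as an
# illustrative sanity check; it assumes classify_tokens only changes the type,
# as the property test above expects
def test_tokenize_classification_example():
    location = tokenize.TokenLocation(1, 1, "")
    input = [
        tokenize.Token("If", location, tokenize.TokenType.UNKNOWN),
        tokenize.Token(" ", location, tokenize.TokenType.UNKNOWN),
        tokenize.Token("True", location, tokenize.TokenType.UNKNOWN),
        tokenize.Token("\n", location, tokenize.TokenType.UNKNOWN),
    ]
    expected = [
        tokenize.Token("If", location, tokenize.TokenType.KEYWORD),
        tokenize.Token(" ", location, tokenize.TokenType.SPACE),
        tokenize.Token("True", location, tokenize.TokenType.BOOL),
        tokenize.Token("\n", location, tokenize.TokenType.NEWLINE),
    ]
    assert tokenize.classify_tokens(input) == expected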


# Draws a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_token_classified()))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer classifies tokens as expected, then
# - The tokenizer sets the token locations as expected
@given(draw_source_fuzz(), text())
def test_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    classified = tokenize.classify_tokens(split)
    located = tokenize.locate_tokens(classified, filename)
    tokenized = tokenize.tokenize(source, filename)
    assert located == tokenized
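

# A concrete end-to-end example, kept as an illustrative sanity check; the
# source "If True\n" and the filename "example" are arbitrary values, and the
# expected tokens assume the split, classify and locate behaviours described
# by the property tests above
def test_tokenize_example():
    filename = "example"
    expected = [
        tokenize.Token(
            "If", tokenize.TokenLocation(1, 1, filename), tokenize.TokenType.KEYWORD
        ),
        tokenize.Token(
            " ", tokenize.TokenLocation(1, 3, filename), tokenize.TokenType.SPACE
        ),
        tokenize.Token(
            "True", tokenize.TokenLocation(1, 4, filename), tokenize.TokenType.BOOL
        ),
        tokenize.Token(
            "\n", tokenize.TokenLocation(1, 8, filename), tokenize.TokenType.NEWLINE
        ),
    ]
    assert tokenize.tokenize("If True\n", filename) == expected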