# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given
from hypothesis.strategies import (
    booleans,
    characters,
    composite,
    integers,
    just,
    lists,
    one_of,
    sampled_from,
    text,
)

from src import tokenize

# Whitespace that separates lexer words
whitespace = " \n\t"


# Draws a random token location
@composite
def draw_token_location(draw):
    line = draw(integers())
    column = draw(integers())
    filename = draw(text())
    return tokenize.TokenLocation(line, column, filename)


# Test location getters
@given(integers(), integers(), text())
def test_tokenize_location_getters(line, column, filename):
    test = tokenize.TokenLocation(line, column, filename)
    assert test.line == line
    assert test.column == column
    assert test.file == filename


# Test location equality
@given(draw_token_location(), draw_token_location())
def test_tokenize_location_equality(location1, location2):
    equals = (
        location1.line == location2.line
        and location1.column == location2.column
        and location1.file == location2.file
    )
    assert (location1 == location2) == equals


# Draws a random token type
@composite
def draw_token_type(draw):
    return draw(sampled_from(list(tokenize.TokenType)))


# Draws a random token
@composite
def draw_token_random(draw):
    value = draw(text())
    location = draw(draw_token_location())
    type = draw(draw_token_type())
    return tokenize.Token(value, location, type)


# Test token getters
@given(text(), draw_token_location(), draw_token_type())
def test_tokenize_token_getters(value, location, type):
    test = tokenize.Token(value, location, type)
    assert test.value == value
    assert test.location == location
    assert test.type == type


# Test token equality
@given(draw_token_random(), draw_token_random())
def test_tokenize_token_equality(token1, token2):
    equals = (
        token1.value == token2.value
        and token1.location == token2.location
        and token1.type == token2.type
    )
    assert (token1 == token2) == equals


# Draws a tokenizer non-whitespace token
@composite
def draw_token_nonwhitespace(draw):
    chars = characters(blacklist_characters=whitespace)
    value = draw(text(alphabet=chars, min_size=1))
    location = draw(draw_token_location())
    type = tokenize.TokenType.UNKNOWN
    return tokenize.Token(value, location, type)


# Draws a tokenizer whitespace token
@composite
def draw_token_whitespace(draw):
    value = draw(sampled_from(whitespace))
    location = draw(draw_token_location())
    type = tokenize.TokenType.WHITESPACE
    return tokenize.Token(value, location, type)


# Draws a token with the values split_tokens outputs: the original
# value, a placeholder location and an UNKNOWN type
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy())
    location = tokenize.TokenLocation(1, 1, "")
    type = tokenize.TokenType.UNKNOWN
    return tokenize.Token(token.value, location, type)


# Generates an alternating sequence of whitespace and non-whitespace tokens
@composite
def draw_tokens_list(draw):
    output = []
    # A throwaway list whose length decides how many alternating runs to emit
    elements = draw(lists(just(True)))
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            strategy = draw_token_whitespace
            locationed = draw_token_splitted(strategy)
            output += draw(lists(locationed, min_size=1))
        else:
            strategy = draw_token_nonwhitespace
            locationed = draw_token_splitted(strategy)
            output.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    return output


# Test that the tokenizer can split tokens properly
@given(draw_tokens_list())
def test_tokenize_split_tokens(tokens):
    input = ""
    for t in tokens:
        input += t.value
    assert tokenize.split_tokens(input) == tokens
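

# A concrete illustration of the property above, with hand-derived
# expectations: split_tokens should cut "hi there" into the two words
# and the space between them, each token carrying the placeholder
# location (1, 1, "") and the UNKNOWN type that draw_token_splitted
# assumes. This is a sketch inferred from the strategies in this file,
# not documented split_tokens behaviour.
def test_tokenize_split_tokens_example():
    location = tokenize.TokenLocation(1, 1, "")
    unknown = tokenize.TokenType.UNKNOWN
    expected = [
        tokenize.Token("hi", location, unknown),
        tokenize.Token(" ", location, unknown),
        tokenize.Token("there", location, unknown),
    ]
    assert tokenize.split_tokens("hi there") == expected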


# Generates a list of tokens with correct locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(draw_tokens_list())
    filename = draw(text())
    new_tokens = []
    line = 1
    column = 1
    for t in tokens:
        location = tokenize.TokenLocation(line, column, filename)
        new = tokenize.Token(t.value, location, t.type)
        new_tokens.append(new)
        if t.value == "\n":
            line = line + 1
            column = 1
        else:
            column += len(t.value)
    return new_tokens


# Test that the tokenizer can determine locations
@given(draw_tokens_locations())
def test_tokenize_locations(tokens):
    input = []
    filename = ""
    location = tokenize.TokenLocation(1, 1, "")
    for t in tokens:
        input.append(tokenize.Token(t.value, location, t.type))
        filename = t.location.file
    assert tokenize.locate_tokens(input, filename) == tokens


# Draws a pair of token lists: an input list with random types and the
# expected list with correct types
@composite
def draw_tokens_classified(draw):
    strategies = [
        draw_token_nonwhitespace(),
        draw_token_whitespace(),
    ]
    tokens = draw(lists(one_of(strategies)))
    input = []
    for t in tokens:
        type = draw(draw_token_type())
        input.append(tokenize.Token(t.value, t.location, type))
    return (input, tokens)


# Test that classification can add types properly
@given(draw_tokens_classified())
def test_tokenize_classification(test_data):
    (input, tokens) = test_data
    assert tokenize.classify_tokens(input) == tokens


# Draws random source text by concatenating random token values
@composite
def draw_source_fuzz(draw):
    strategies = [
        draw_token_nonwhitespace(),
        draw_token_whitespace(),
    ]
    tokens = draw(lists(one_of(strategies)))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that tokenize matches splitting, locating and classifying
@given(draw_source_fuzz(), text())
def test_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    located = tokenize.locate_tokens(split, filename)
    classified = tokenize.classify_tokens(located)
    tokenized = tokenize.tokenize(source, filename)
    assert classified == tokenized
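

# A concrete end-to-end sketch of the pipeline the fuzz test checks,
# with hand-derived expectations: tokenizing "hi there" should yield
# the words and the space, located by the column arithmetic from
# draw_tokens_locations and typed as in draw_token_whitespace and
# draw_token_nonwhitespace. The filename "example" is arbitrary, and
# the expected tokens are inferred from the properties in this file
# rather than from documented tokenize behaviour.
def test_tokenize_example():
    unknown = tokenize.TokenType.UNKNOWN
    whitespace_type = tokenize.TokenType.WHITESPACE
    expected = [
        tokenize.Token("hi", tokenize.TokenLocation(1, 1, "example"), unknown),
        tokenize.Token(" ", tokenize.TokenLocation(1, 3, "example"), whitespace_type),
        tokenize.Token("there", tokenize.TokenLocation(1, 4, "example"), unknown),
    ]
    assert tokenize.tokenize("hi there", "example") == expected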