# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given
from hypothesis.strategies import (
    booleans,
    composite,
    just,
    lists,
    one_of,
    text,
)

from src import tokenize
from src.syntax import Syntax, SyntaxLocation, SyntaxType
from tests.test_syntax import (
    draw_syntax_token,
    draw_token_newline,
    draw_token_space,
    draw_token_unknown,
    valid_spaces,
    valid_newlines,
)


# Draws a token using an existing strategy, but with a blank location,
# just as split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = SyntaxLocation(1, 1, "")
    return Syntax(token.value, location, SyntaxType.TOKEN)


# Merges adjacent \r and \n tokens into single \r\n tokens
def merge_crlf(tokens):
    if len(tokens) < 2:
        return tokens
    prev = tokens[0]
    merged = []
    for curr in tokens[1:]:
        if prev.value == "\r" and curr.value == "\n":
            # Previous token is \r, don't append it
            # Instead promote this \n token to \r\n
            prev = Syntax("\r\n", prev.location, SyntaxType.TOKEN)
        else:
            # Append the previous token
            merged.append(prev)
            prev = curr
    merged.append(prev)
    return merged
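

# A minimal hand-written sanity check for the helper above, assuming Syntax
# compares by value (as the property tests below rely on): adjacent "\r" and
# "\n" tokens collapse into one "\r\n" token
def test_merge_crlf_example():
    location = SyntaxLocation(1, 1, "")
    cr = Syntax("\r", location, SyntaxType.TOKEN)
    lf = Syntax("\n", location, SyntaxType.TOKEN)
    crlf = Syntax("\r\n", location, SyntaxType.TOKEN)
    assert merge_crlf([cr, lf]) == [crlf]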


# Generates an alternating sequence of unknown or whitespace tokens
# intended for splitting into separate tokens
@composite
def draw_tokens_to_split(draw):
    source = ""
    tokens = []
    elements = draw(lists(just(True)))  # Dummy list for sizing
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespaces get split into multiple tokens
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            spaces = draw(lists(locationed, min_size=1))
            tokens += merge_crlf(spaces)
        else:
            strategy = draw_token_unknown()
            locationed = draw_token_splitted(strategy)
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    for t in tokens:
        source += t.value
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is any of the following Unicode sequences:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated into different tokens
# - Whitespace sequences are split into multiple adjacent tokens
# - Non-whitespace code points are combined into a single token
# - Each token location is line 1 offset 1 of file ""
@given(draw_tokens_to_split())
def test_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens
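

# A concrete, hand-written illustration of the splitting rules above,
# assuming the behaviour documented for the property test: each whitespace
# character (or \r\n sequence) becomes its own token, runs of non-whitespace
# become a single token, and every location is line 1 offset 1 of file ""
def test_tokenize_split_tokens_example():
    location = SyntaxLocation(1, 1, "")
    expected = [
        Syntax("ab", location, SyntaxType.TOKEN),
        Syntax(" ", location, SyntaxType.TOKEN),
        Syntax("\r\n", location, SyntaxType.TOKEN),
        Syntax("cd", location, SyntaxType.TOKEN),
    ]
    assert tokenize.split_tokens("ab \r\ncd") == expected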


# Generates a list of tokens with correct locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_syntax_token()))
    filename = draw(text())
    located = []
    line = 1
    offset = 1
    for t in tokens:
        location = SyntaxLocation(line, offset, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        located.append(new)
        if t.value in valid_newlines:
            line = line + 1
            offset = 1
        else:
            offset += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - New line tokens are tokens with one of the following Unicode sequences:
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A token's line is equal to 1 plus the number of new line tokens
#   before the token
# - A token's offset is equal to 1 plus the sum of the value lengths of
#   the tokens 'before' it, where 'before' means any token between the
#   last new line token (or the start of the file) and the token itself
@given(draw_tokens_locations())
def test_tokenize_locations(test_data):
    (input, located, filename) = test_data
    assert tokenize.locate_tokens(input, filename) == located
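

# A concrete, hand-written illustration of the location rules above,
# assuming the behaviour documented for the property test; the filename
# "example" is an arbitrary placeholder
def test_tokenize_locations_example():
    unlocated = SyntaxLocation(1, 1, "")
    tokens = [
        Syntax("ab", unlocated, SyntaxType.TOKEN),
        Syntax("\n", unlocated, SyntaxType.TOKEN),
        Syntax("cd", unlocated, SyntaxType.TOKEN),
    ]
    located = [
        Syntax("ab", SyntaxLocation(1, 1, "example"), SyntaxType.TOKEN),
        Syntax("\n", SyntaxLocation(1, 3, "example"), SyntaxType.TOKEN),
        Syntax("cd", SyntaxLocation(2, 1, "example"), SyntaxType.TOKEN),
    ]
    assert tokenize.locate_tokens(tokens, "example") == located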


# Generates two lists of tokens: one with whitespace and one without
@composite
def draw_tokens_whitespace(draw):
    input = draw(lists(draw_syntax_token()))
    stripped = []
    for s in input:
        is_whitespace = s.value in valid_spaces or s.value in valid_newlines
        if s.type != SyntaxType.TOKEN or not is_whitespace:
            stripped.append(s)
    return (input, stripped)


# Test that the tokenizer can strip whitespace correctly
# We expect the following behaviour:
# - No syntax is modified
# - Tokens with the following values are removed from the output:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
@given(draw_tokens_whitespace())
def test_tokenize_strip_whitespace(test_data):
    (input, syntax) = test_data
    assert tokenize.strip_whitespace(input) == syntax
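

# A concrete, hand-written illustration of whitespace stripping, assuming
# the behaviour documented for the property test: space and new line tokens
# are dropped, everything else is kept unmodified
def test_tokenize_strip_whitespace_example():
    location = SyntaxLocation(1, 1, "")
    space = Syntax(" ", location, SyntaxType.TOKEN)
    newline = Syntax("\n", location, SyntaxType.TOKEN)
    word = Syntax("ab", location, SyntaxType.TOKEN)
    assert tokenize.strip_whitespace([space, word, newline]) == [word]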


# Draws a token and possibly adds garbage
# This is to ensure that tokens must completely match a value
@composite
def draw_syntax_token_garbled(draw):
    token = draw(draw_syntax_token())
    value = token.value
    if draw(booleans()):
        value = draw(text(min_size=1)) + value
    if draw(booleans()):
        value = value + draw(text(min_size=1))
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draws a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_syntax_token()))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer sets the token locations as expected, then
# - The tokenizer strips whitespace as expected
@given(draw_source_fuzz(), text())
def test_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    located = tokenize.locate_tokens(split, filename)
    stripped = tokenize.strip_whitespace(located)
    tokenized = tokenize.tokenize(source, filename)
    assert stripped == tokenized
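

# A concrete end-to-end example, assuming tokenize composes the three steps
# exercised by the fuzz test above (split, locate, strip); the filename
# "example" is an arbitrary placeholder
def test_tokenize_example():
    expected = [
        Syntax("ab", SyntaxLocation(1, 1, "example"), SyntaxType.TOKEN),
        Syntax("cd", SyntaxLocation(1, 4, "example"), SyntaxType.TOKEN),
        Syntax("ef", SyntaxLocation(2, 1, "example"), SyntaxType.TOKEN),
    ]
    assert tokenize.tokenize("ab cd\nef", "example") == expected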