# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import assume, given
from hypothesis.strategies import (
    booleans,
    characters,
    composite,
    just,
    lists,
    one_of,
    sampled_from,
    text,
)

from newlang.parse2 import tokenize
from newlang.parse2.token import Token, TokenLocation
from tests.parse2.test_token import static_token_location

# Values considered spaces
valid_spaces = [
    "\t",  # U+0009 HORIZONTAL TAB
    " ",  # U+0020 SPACE
]

# Single values reserved for new line use
single_newlines = [
    "\n",  # U+000A LINE FEED
    "\v",  # U+000B VERTICAL TAB
    "\f",  # U+000C FORM FEED
    "\r",  # U+000D CARRIAGE RETURN
    "\u0085",  # U+0085 NEXT LINE
    "\u2028",  # U+2028 LINE SEPARATOR
    "\u2029",  # U+2029 PARAGRAPH SEPARATOR
]

# Multi values reserved for new line use
multi_newlines = [
    "\r\n",  # U+000A U+000D CARRIAGE RETURN then LINE FEED
]

# All values reserved for new line use
valid_newlines = single_newlines + multi_newlines


# Draws a space token
@composite
def draw_token_space(draw):
    location = static_token_location()
    value = draw(sampled_from(valid_spaces))
    return Token(value, location)


# Draws a new line token
@composite
def draw_token_newline(draw):
    location = static_token_location()
    value = draw(sampled_from(valid_newlines))
    return Token(value, location)


# Draws a random token without whitespace
@composite
def draw_token_nospace(draw):
    reserved = valid_spaces + single_newlines
    location = static_token_location()
    chars = characters(blacklist_characters=reserved)
    value = draw(text(alphabet=chars, min_size=1))
    # The alphabet blacklist only excludes single code points, so a
    # multi-character newline sequence could still appear inside the text
    for v in multi_newlines:
        assume(v not in value)
    return Token(value, location)


# Draws a random token perhaps with whitespaces
@composite
def draw_token_maybespace(draw):
    strategies = [
        draw_token_space(),
        draw_token_newline(),
        draw_token_nospace(),
    ]
    return draw(one_of(strategies))


# Draws a token using an existing strategy but with a blank location
# just like split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = TokenLocation(1, 1, "")
    return Token(token.value, location)


# Merges \r and \n tokens to \r\n tokens
def merge_crlf(tokens):
    if len(tokens) < 2:
        return tokens
    prev = tokens[0]
    merged = []
    for curr in tokens[1:]:
        if prev.value == "\r" and curr.value == "\n":
            # Previous token is \r, don't append it
            # Instead promote this \n token to \r\n
            prev = Token("\r\n", prev.location)
        else:
            # Append the previous token
            merged.append(prev)
            prev = curr
    merged.append(prev)
    return merged


# Generates an alternating sequence of unknown or whitespace tokens
# intended for splitting in to separate tokens
@composite
def draw_tokens_to_split(draw):
    source = ""
    tokens = []
    elements = draw(lists(just(True)))  # Dummy list for sizing
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespaces get split in to multiple tokens
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            spaces = draw(lists(locationed, min_size=1))
            tokens += merge_crlf(spaces)
        else:
            locationed = draw_token_splitted(draw_token_nospace())
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    for t in tokens:
        source += t.value
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is any of the following Unicode sequences:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated in to separate tokens
# - Whitespace sequences are split in to multiple adjacent tokens
# - Non-whitespace code points are combined in to a single token
# - Each token location is line 1 offset 1 of file ""
@given(draw_tokens_to_split())
def test_parse2_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens


# Generates a list of tokens with correct locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_token_maybespace()))
    filename = draw(text())
    located = []
    line = 1
    offset = 1
    for t in tokens:
        location = TokenLocation(line, offset, filename)
        new = Token(t.value, location)
        located.append(new)
        if t.value in valid_newlines:
            line = line + 1
            offset = 1
        else:
            offset += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - New line tokens are tokens with one of the following Unicode sequences:
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A token's line is equal to 1 plus the number of new line tokens
#   before the token
# - A token's offset is equal to 1 plus the sum of previous token's
#   value lengths 'before' it, where 'before' is defined as any
#   token between the last new line token (or start of file) before the token
#   and the token itself
@given(draw_tokens_locations())
def test_parse2_tokenize_locations(test_data):
    (input, located, filename) = test_data
    assert tokenize.locate_tokens(input, filename) == located


# Generates two list of tokens: One with whitespace and one without
@composite
def draw_tokens_whitespace(draw):
    input = draw(lists(draw_token_maybespace()))
    stripped = []
    for t in input:
        is_whitespace = t.value in valid_spaces or t.value in valid_newlines
        if not is_whitespace:
            stripped.append(t)
    return (input, stripped)


# Test that the tokenizer can strip whitespace correctly
# We expect the following behaviour:
# - No tokens are modified
# - Tokens with the following values are removed from the output:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
@given(draw_tokens_whitespace())
def test_parse2_tokenize_strip_whitespace(test_data):
    (input, tokens) = test_data
    assert tokenize.strip_whitespace(input) == tokens


# Draw a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_token_maybespace()))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer sets the token locations as expected
@given(draw_source_fuzz(), text())
def test_parse2_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    located = tokenize.locate_tokens(split, filename)
    stripped = tokenize.strip_whitespace(located)
    tokenized = tokenize.tokenize(source, filename)
    assert stripped == tokenized