# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given
from hypothesis.strategies import (
    booleans,
    composite,
    just,
    lists,
    one_of,
    text,
)

from src import tokenize
from src.syntax import Syntax, SyntaxLocation, SyntaxType
from tests.test_syntax import (
    draw_token_classified,
    draw_token_newline,
    draw_token_space,
    draw_token_unknown,
)


# Draws a token using an existing strategy, but with a blank location
# just like split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = SyntaxLocation(1, 1, "")
    return Syntax(token.value, location, SyntaxType.TOKEN)


# Generates an alternating sequence of unknown or whitespace tokens
# intended for splitting into separate tokens
@composite
def draw_tokens_to_split(draw):
    source = ""
    tokens = []
    elements = draw(lists(just(True)))
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespaces get split into multiple tokens
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            tokens += draw(lists(locationed, min_size=1))
        else:
            strategy = draw_token_unknown
            locationed = draw_token_splitted(strategy())
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    for t in tokens:
        source += t.value
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is a tab, space or new line
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated into tokens
# - Whitespace characters are split into multiple tokens
# - Non-whitespace characters are combined into a single token
# - Each token location is line 1 offset 1 of file ""
@given(draw_tokens_to_split())
def test_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens


# Generates a list of tokens and their corrected locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_token_classified()))
    filename = draw(text())
    located = []
    line = 1
    offset = 1
    for t in tokens:
        location = SyntaxLocation(line, offset, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        located.append(new)
        if t.value == "\n":
            line = line + 1
            offset = 1
        else:
            offset += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A location's line is equal to 1 plus the number of tokens with the
#   value '\n' before the token
# - A location's offset is equal to 1 plus the count of previous tokens'
#   value code points 'before' it, where 'before' is defined as any
#   token between the last '\n' (or start of file) before the token
#   and the token itself
@given(draw_tokens_locations())
def test_tokenize_locations(test_data):
    (input, located, filename) = test_data
    assert tokenize.locate_tokens(input, filename) == located


# Draws a token and possibly adds garbage
# This is to ensure that tokens must completely match a value
@composite
def draw_token_classified_garbled(draw):
    token = draw(draw_token_classified())
    value = token.value
    if draw(booleans()):
        value = draw(text(min_size=1)) + value
    if draw(booleans()):
        value = value + draw(text(min_size=1))
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draws a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_token_classified()))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer sets the token locations as expected
@given(draw_source_fuzz(), text())
def test_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    located = tokenize.locate_tokens(split, filename)
    tokenized = tokenize.tokenize(source, filename)
    assert located == tokenized
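

# A minimal illustrative sketch of the behaviour the tests above describe,
# not part of the property-based suite. The concrete input "ab \nc" and the
# expected locations below are an assumption derived from the comments above
# (non-whitespace runs combine, whitespace splits per character, lines advance
# on '\n'), not from the tokenizer implementation itself. It relies on Syntax
# comparing by value, as the assertions above already do.
if __name__ == "__main__":
    expected = [
        Syntax("ab", SyntaxLocation(1, 1, "demo"), SyntaxType.TOKEN),
        Syntax(" ", SyntaxLocation(1, 3, "demo"), SyntaxType.TOKEN),
        Syntax("\n", SyntaxLocation(1, 4, "demo"), SyntaxType.TOKEN),
        Syntax("c", SyntaxLocation(2, 1, "demo"), SyntaxType.TOKEN),
    ]
    assert tokenize.tokenize("ab \nc", "demo") == expected
    print("tokenize example matches the expected tokens")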