# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given
from hypothesis.strategies import (
    booleans,
    composite,
    just,
    lists,
    one_of,
    text,
)

from src import tokenize
from src.syntax import Syntax, SyntaxLocation, SyntaxType
from tests.test_syntax import (
    draw_token_classified,
    draw_token_newline,
    draw_token_space,
    draw_token_unknown,
    valid_spaces,
    valid_newlines,
)


# Draws a token using an existing strategy but with a blank location
# just like split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = SyntaxLocation(1, 1, "")
    return Syntax(token.value, location, SyntaxType.TOKEN)


# Merges \r and \n tokens to \r\n tokens
def merge_crlf(tokens):
    if len(tokens) < 2:
        return tokens
    prev = tokens[0]
    merged = []
    for curr in tokens[1:]:
        if prev.value == "\r" and curr.value == "\n":
            # Don't append the \r
            # Set prev to \r\n instead of \n
            prev = Syntax("\r\n", prev.location, SyntaxType.TOKEN)
        else:
            merged.append(prev)
            prev = curr
    merged.append(prev)
    return merged


# Generates an alternating sequence of unknown or whitespace tokens
# intended for splitting into separate tokens
@composite
def draw_tokens_to_split(draw):
    source = ""
    tokens = []
    elements = draw(lists(just(True)))
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespaces get split into multiple tokens
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            spaces = draw(lists(locationed, min_size=1))
            tokens += merge_crlf(spaces)
        else:
            strategy = draw_token_unknown
            locationed = draw_token_splitted(strategy())
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    for t in tokens:
        source += t.value
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is any of the following Unicode sequences:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated into tokens
# - Whitespace sequences are split into multiple tokens
# - Non-whitespace code points are combined into a single token
# - Each token location is line 1 offset 1 of file ""
@given(draw_tokens_to_split())
def test_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens


# Generates a list of tokens and their corrected locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_token_classified()))
    filename = draw(text())
    located = []
    line = 1
    offset = 1
    for t in tokens:
        location = SyntaxLocation(line, offset, filename)
        new = Syntax(t.value, location, SyntaxType.TOKEN)
        located.append(new)
        # New lines move the location to the start of the next line,
        # anything else just advances the offset by the token's length
        if t.value in valid_newlines:
            line = line + 1
            offset = 1
        else:
            offset += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - New line tokens are tokens with one of the following Unicode sequences:
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A token's line is equal to 1 plus the number of new line tokens
#   before the token
# - A token's offset is equal to 1 plus the sum of previous token's
#   value lengths 'before' it, where 'before' is defined as any
#   token between the last new line token (or start of file) before the token
#   and the token itself
@given(draw_tokens_locations())
def test_tokenize_locations(test_data):
    (input, located, filename) = test_data
    assert tokenize.locate_tokens(input, filename) == located


# Generates a list of tokens with and without whitespace
@composite
def draw_tokens_whitespace(draw):
    input = draw(lists(draw_token_classified()))
    syntax = []
    for s in input:
        # Keep everything except whitespace tokens
        if s.type != SyntaxType.TOKEN or (
            s.value not in valid_spaces and s.value not in valid_newlines
        ):
            syntax.append(s)
    return (input, syntax)


# Test that the tokenizer can strip whitespace correctly
# We expect the following behaviour:
# - No syntax is modified
# - Tokens with the following values are removed from the output:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
@given(draw_tokens_whitespace())
def test_tokenize_strip_whitespace(test_data):
    (input, syntax) = test_data
    assert tokenize.strip_whitespace(input) == syntax


# Draws a token and possibly add garbage
# This is to ensure that tokens must completely match a value
@composite
def draw_token_classified_garbled(draw):
    token = draw(draw_token_classified())
    value = token.value
    if draw(booleans()):
        value = draw(text(min_size=1)) + value
    if draw(booleans()):
        value = value + draw(text(min_size=1))
    return Syntax(value, token.location, SyntaxType.TOKEN)


# Draw a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_token_classified()))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer sets the token locations as expected
@given(draw_source_fuzz(), text())
def test_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    located = tokenize.locate_tokens(split, filename)
    stripped = tokenize.strip_whitespace(located)
    tokenized = tokenize.tokenize(source, filename)
    assert stripped == tokenized