# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given
from hypothesis.strategies import (
    booleans,
    composite,
    just,
    lists,
    one_of,
    text,
)

from src import tokenize
from src.token import Token, TokenLocation
from tests.test_token import (
    draw_token_random,
    draw_token_newline,
    draw_token_space,
    draw_token_unknown,
    valid_spaces,
    valid_newlines,
)


# Draws a token using an existing strategy but with a blank location
# just like split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = TokenLocation(1, 1, "")
    return Token(token.value, location)


# Merges \r and \n tokens to \r\n tokens
def merge_crlf(tokens):
    if len(tokens) < 2:
        return tokens
    prev = tokens[0]
    merged = []
    for curr in tokens[1:]:
        if prev.value == "\r" and curr.value == "\n":
            # Previous token is \r, don't append it
            # Instead promote this \n token to \r\n
            # The merged token keeps the \r token's location
            prev = Token("\r\n", prev.location)
        else:
            # Append the previous token
            merged.append(prev)
            prev = curr
    merged.append(prev)
    return merged


# Generates an alternating sequence of unknown or whitespace tokens
# intended for splitting in to separate tokens
@composite
def draw_tokens_to_split(draw):
    tokens = []
    elements = draw(lists(just(True)))  # Dummy list for sizing
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespaces get split in to multiple tokens,
            # except \r followed by \n which merges in to one \r\n token
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            spaces = draw(lists(locationed, min_size=1))
            tokens += merge_crlf(spaces)
        else:
            # Adjacent non-whitespace merges in to one token, so draw
            # exactly one unknown token per turn
            strategy = draw_token_unknown
            locationed = draw_token_splitted(strategy())
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    # The source to split is the concatenation of all token values
    source = "".join([t.value for t in tokens])
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is any of the following Unicode sequences:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated in to separate tokens
# - Whitespace sequences are split in to multiple adjacent tokens
# - Non-whitespace code points are combined in to a single token
# - Each token location is line 1 offset 1 of file ""
@given(draw_tokens_to_split())
def test_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens


# Generates a list of tokens with correct locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_token_random()))
    filename = draw(text())
    located = []
    line = 1
    offset = 1
    for t in tokens:
        location = TokenLocation(line, offset, filename)
        new = Token(t.value, location)
        located.append(new)
        if t.value in valid_newlines:
            # A new line resets the offset and advances the line count
            line = line + 1
            offset = 1
        else:
            offset += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - New line tokens are tokens with one of the following Unicode sequences:
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A token's line is equal to 1 plus the number of new line tokens
#   before the token
# - A token's offset is equal to 1 plus the sum of previous token's
#   value lengths 'before' it, where 'before' is defined as any
#   token between the last new line token (or start of file) before the token
#   and the token itself
@given(draw_tokens_locations())
def test_tokenize_locations(test_data):
    (tokens, located, filename) = test_data
    assert tokenize.locate_tokens(tokens, filename) == located


# Generates two list of tokens: One with whitespace and one without
@composite
def draw_tokens_whitespace(draw):
    tokens = draw(lists(draw_token_random()))
    stripped = []
    for t in tokens:
        is_whitespace = t.value in valid_spaces or t.value in valid_newlines
        if not is_whitespace:
            stripped.append(t)
    return (tokens, stripped)


# Test that the tokenizer can strip whitespace correctly
# We expect the following behaviour:
# - No tokens are modified
# - Tokens with the following values are removed from the output:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
@given(draw_tokens_whitespace())
def test_tokenize_strip_whitespace(test_data):
    (tokens, stripped) = test_data
    assert tokenize.strip_whitespace(tokens) == stripped


# Draws a token and possibly add garbage
# This is to ensure that tokens must completely match a value
@composite
def draw_token_random_garbled(draw):
    token = draw(draw_token_random())
    value = token.value
    if draw(booleans()):
        value = draw(text(min_size=1)) + value
    if draw(booleans()):
        value = value + draw(text(min_size=1))
    return Token(value, token.location)


# Draw a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_token_random()))
    return "".join([t.value for t in tokens])


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer sets the token locations as expected
@given(draw_source_fuzz(), text())
def test_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    located = tokenize.locate_tokens(split, filename)
    stripped = tokenize.strip_whitespace(located)
    tokenized = tokenize.tokenize(source, filename)
    assert stripped == tokenized