# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import assume, given
from hypothesis.strategies import (
    booleans,
    characters,
    composite,
    just,
    lists,
    one_of,
    sampled_from,
    text,
)

from newlang.parse2 import tokenize
from newlang.parse2.token import Token, TokenLocation
from tests.parse2.test_token import static_token_location

# Values considered spaces
valid_spaces = [
    "\t",  # U+0009 HORIZONTAL TAB
    " ",  # U+0020 SPACE
]

# Single values reserved for new line use
single_newlines = [
    "\n",  # U+000A LINE FEED
    "\v",  # U+000B VERTICAL TAB
    "\f",  # U+000C FORM FEED
    "\r",  # U+000D CARRIAGE RETURN
    "\u0085",  # U+0085 NEXT LINE
    "\u2028",  # U+2028 LINE SEPARATOR
    "\u2029",  # U+2029 PARAGRAPH SEPARATOR
]

# Multi values reserved for new line use
multi_newlines = [
    "\r\n",  # U+000A U+000D CARRIAGE RETURN then LINE FEED
]

# All values reserved for new line use
valid_newlines = single_newlines + multi_newlines


# Draws a space token
@composite
def draw_token_space(draw):
    location = static_token_location()
    value = draw(sampled_from(valid_spaces))
    return Token(value, location)


# Draws a new line token
@composite
def draw_token_newline(draw):
    location = static_token_location()
    value = draw(sampled_from(valid_newlines))
    return Token(value, location)


# Draws a random token without whitespace
@composite
def draw_token_nospace(draw):
    reserved = valid_spaces + single_newlines
    location = static_token_location()
    chars = characters(blacklist_characters=reserved)
    value = draw(text(alphabet=chars, min_size=1))
    # The alphabet blacklist only excludes single code points, so a
    # multi-character newline sequence could still appear inside the text
    for v in multi_newlines:
        assume(v not in value)
    return Token(value, location)


# Draws a random token perhaps with whitespaces
@composite
def draw_token_maybespace(draw):
    strategies = [
        draw_token_space(),
        draw_token_newline(),
        draw_token_nospace(),
    ]
    return draw(one_of(strategies))


# Draws a token using an existing strategy but with a blank location
# just like split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = TokenLocation(1, 1, "")
    return Token(token.value, location)


# Merges \r and \n tokens to \r\n tokens
def merge_crlf(tokens):
    if len(tokens) < 2:
        return tokens
    prev = tokens[0]
    merged = []
    for curr in tokens[1:]:
        if prev.value == "\r" and curr.value == "\n":
            # Previous token is \r, don't append it
            # Instead promote this \n token to \r\n
            prev = Token("\r\n", prev.location)
        else:
            # Append the previous token
            merged.append(prev)
            prev = curr
    merged.append(prev)
    return merged


# Generates an alternating sequence of unknown or whitespace tokens
# intended for splitting in to separate tokens
@composite
def draw_tokens_to_split(draw):
    source = ""
    tokens = []
    elements = draw(lists(just(True)))  # Dummy list for sizing
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespaces get split in to multiple tokens
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            spaces = draw(lists(locationed, min_size=1))
            tokens += merge_crlf(spaces)
        else:
            locationed = draw_token_splitted(draw_token_nospace())
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    for t in tokens:
        source += t.value
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is any of the following Unicode sequences:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated in to separate tokens
# - Whitespace sequences are split in to multiple adjacent tokens
# - Non-whitespace code points are combined in to a single token
# - Each token location is line 1 offset 1 of file ""
@given(draw_tokens_to_split())
def test_parse2_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens


# Generates a list of tokens with correct locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_token_maybespace()))
    filename = draw(text())
    located = []
    line = 1
    offset = 1
    for t in tokens:
        location = TokenLocation(line, offset, filename)
        new = Token(t.value, location)
        located.append(new)
        if t.value in valid_newlines:
            line = line + 1
            offset = 1
        else:
            offset += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - New line tokens are tokens with one of the following Unicode sequences:
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A token's line is equal to 1 plus the number of new line tokens
#   before the token
# - A token's offset is equal to 1 plus the sum of previous token's
#   value lengths 'before' it, where 'before' is defined as any
#   token between the last new line token (or start of file) before the token
#   and the token itself
@given(draw_tokens_locations())
def test_parse2_tokenize_locations(test_data):
    (input, located, filename) = test_data
    assert tokenize.locate_tokens(input, filename) == located


# Generates two list of tokens: One with whitespace and one without
@composite
def draw_tokens_whitespace(draw):
    input = draw(lists(draw_token_maybespace()))
    stripped = []
    for t in input:
        is_whitespace = t.value in valid_spaces or t.value in valid_newlines
        if not is_whitespace:
            stripped.append(t)
    return (input, stripped)


# Test that the tokenizer can strip whitespace correctly
# We expect the following behaviour:
# - No tokens are modified
# - Tokens with the following values are removed from the output:
#   U+0009 HORIZONTAL TAB
#   U+000A LINE FEED
#   U+000B VERTICAL TAB
#   U+000C FORM FEED
#   U+000D CARRIAGE RETURN
#   U+000D U+000A CARRIAGE RETURN then LINE FEED
#   U+0020 SPACE
#   U+0085 NEXT LINE
#   U+2028 LINE SEPARATOR
#   U+2029 PARAGRAPH SEPARATOR
@given(draw_tokens_whitespace())
def test_parse2_tokenize_strip_whitespace(test_data):
    (input, tokens) = test_data
    assert tokenize.strip_whitespace(input) == tokens


# Draw a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_token_maybespace()))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer sets the token locations as expected
@given(draw_source_fuzz(), text())
def test_parse2_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    located = tokenize.locate_tokens(split, filename)
    stripped = tokenize.strip_whitespace(located)
    tokenized = tokenize.tokenize(source, filename)
    assert stripped == tokenized