# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given, assume
from hypothesis.strategies import (
    booleans,
    characters,
    composite,
    integers,
    just,
    lists,
    one_of,
    sampled_from,
    text,
)

from src import tokenize

# Keywords recognized by the language
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
    "StartNote",
    "EndNote",
    "StartText",
    "EndText",
]


# Draws a random token location
@composite
def draw_token_location(draw):
    line = draw(integers())
    column = draw(integers())
    filename = draw(text())
    return tokenize.TokenLocation(line, column, filename)


# Draws a random token type
@composite
def draw_token_type(draw):
    return draw(sampled_from(list(tokenize.TokenType)))


# Draws a random token
@composite
def draw_token_random(draw):
    value = draw(text())
    location = draw(draw_token_location())
    type = draw(draw_token_type())
    return tokenize.Token(value, location, type)


# Draws an unknown token
@composite
def draw_token_unknown(draw):
    reserved = " \n\t"
    token = draw(draw_token_random())
    chars = characters(blacklist_characters=reserved)
    value = draw(text(alphabet=chars, min_size=1))
    assume(value not in ["True", "False"])
    assume(value not in keywords)
    assume(value[0:2] != "#!")
    type = tokenize.TokenType.UNKNOWN
    return tokenize.Token(value, token.location, type)


# Draws a space token
@composite
def draw_token_space(draw):
    space = " \t"
    token = draw(draw_token_random())
    value = draw(sampled_from(space))
    type = tokenize.TokenType.SPACE
    return tokenize.Token(value, token.location, type)


# Draws a new line token
@composite
def draw_token_newline(draw):
    token = draw(draw_token_random())
    value = "\n"
    type = tokenize.TokenType.NEWLINE
    return tokenize.Token(value, token.location, type)


# Draws a bool token
@composite
def draw_token_bool(draw):
    token = draw(draw_token_random())
    if draw(booleans()):
        value = "True"
    else:
        value = "False"
    type = tokenize.TokenType.BOOL
    return tokenize.Token(value, token.location, type)


# Draws a keyword token
@composite
def draw_token_keyword(draw):
    token = draw(draw_token_random())
    value = draw(sampled_from(keywords))
    type = tokenize.TokenType.KEYWORD
    return tokenize.Token(value, token.location, type)


# Draws a shebang token
@composite
def draw_token_shebang(draw):
    token = draw(draw_token_random())
    value = "#!" + draw(text())
    type = tokenize.TokenType.SHEBANG
    return tokenize.Token(value, token.location, type)


# Draws a classified token
@composite
def draw_token_classified(draw):
    strategies = [
        draw_token_unknown(),
        draw_token_space(),
        draw_token_newline(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_shebang(),
    ]
    token = draw(one_of(strategies))
    return token


# Test location getters
@given(integers(), integers(), text())
def test_tokenize_location_getters(line, column, filename):
    test = tokenize.TokenLocation(line, column, filename)
    assert test.line == line
    assert test.column == column
    assert test.file == filename


# Test location equals
@given(draw_token_location(), draw_token_location())
def test_tokenize_location_equality(location1, location2):
    equals = (
        location1.line == location2.line
        and location1.column == location2.column
        and location1.file == location2.file
    )
    assert (location1 == location2) == equals


# Test token getters
@given(text(), draw_token_location(), draw_token_type())
def test_tokenize_token_getters(value, location, type):
    test = tokenize.Token(value, location, type)
    assert test.value == value
    assert test.location == location
    assert test.type == type


# Test token equals
@given(draw_token_random(), draw_token_random())
def test_tokenize_token_equality(token1, token2):
    equals = (
        token1.value == token2.value
        and token1.location == token2.location
        and token1.type == token2.type
    )
    assert (token1 == token2) == equals


# Draws a token using an existing strategy but with a blank location
# and unknown type, just like split_tokens outputs
@composite
def draw_token_splitted(draw, strategy):
    token = draw(strategy)
    location = tokenize.TokenLocation(1, 1, "")
    type = tokenize.TokenType.UNKNOWN
    return tokenize.Token(token.value, location, type)


# Generates an alternating sequence of unknown or whitespace tokens
# intended for splitting into separate tokens
@composite
def draw_tokens_to_split(draw):
    source = ""
    tokens = []
    elements = draw(lists(just(True)))
    drawing_whitespace = draw(booleans())
    for _ in elements:
        if drawing_whitespace:
            # Multiple whitespaces get split into multiple tokens
            strategy = one_of([draw_token_space(), draw_token_newline()])
            locationed = draw_token_splitted(strategy)
            tokens += draw(lists(locationed, min_size=1))
        else:
            strategy = draw_token_unknown
            locationed = draw_token_splitted(strategy())
            tokens.append(draw(locationed))
        drawing_whitespace = not drawing_whitespace
    for t in tokens:
        source += t.value
    return (source, tokens)


# Test that the tokenizer can split tokens properly
# We expect the following behaviour:
# - Whitespace is a tab, space or new line
# - Non-whitespace is anything else
# - Whitespace and non-whitespace are separated into tokens
# - Whitespace characters are split into multiple tokens
# - Non-whitespace characters are combined into a single token
# - Each token type is UNKNOWN
# - Each token location is line 1 column 1 of file ""
@given(draw_tokens_to_split())
def test_tokenize_split_tokens(test_data):
    (source, tokens) = test_data
    assert tokenize.split_tokens(source) == tokens
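

# A concrete example of the splitting rules above, as a sketch that
# assumes split_tokens behaves exactly as the property test describes:
# each whitespace character becomes its own token, while consecutive
# non-whitespace characters combine into a single token.
def test_tokenize_split_tokens_example():
    location = tokenize.TokenLocation(1, 1, "")
    type = tokenize.TokenType.UNKNOWN
    expected = [
        tokenize.Token("Hello", location, type),
        tokenize.Token(" ", location, type),
        tokenize.Token(" ", location, type),
        tokenize.Token("World", location, type),
        tokenize.Token("\n", location, type),
    ]
    assert tokenize.split_tokens("Hello  World\n") == expected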


# Generates a list of tokens and their corrected locations
@composite
def draw_tokens_locations(draw):
    tokens = draw(lists(draw_token_random()))
    filename = draw(text())
    located = []
    line = 1
    column = 1
    for t in tokens:
        location = tokenize.TokenLocation(line, column, filename)
        new = tokenize.Token(t.value, location, t.type)
        located.append(new)
        if t.type == tokenize.TokenType.NEWLINE:
            line += 1
            column = 1
        else:
            column += len(t.value)
    return (tokens, located, filename)


# Test that the tokenizer can determine locations
# We expect the following behaviour:
# - Only the token location is modified
# - Each token's location filename is the generated filename
# - A token's line is equal to 1 plus the number of tokens with the
#   type NEWLINE before the token
# - A token's column is equal to 1 plus the sum of previous tokens'
#   value lengths 'before' it, where 'before' is defined as any
#   token between the last NEWLINE (or start of file) before the token
#   and the token itself
@given(draw_tokens_locations())
def test_tokenize_locations(test_data):
    (input, located, filename) = test_data
    assert tokenize.locate_tokens(input, filename) == located


# Draws a token and possibly adds garbage
# This is to ensure that tokens must completely match a value
@composite
def draw_token_classified_garbled(draw):
    token = draw(draw_token_classified())
    value = token.value
    type = token.type
    if draw(booleans()):
        value = draw(text(min_size=1)) + value
        type = tokenize.TokenType.UNKNOWN
    if draw(booleans()):
        value = value + draw(text(min_size=1))
        type = tokenize.TokenType.UNKNOWN
    if value[0:2] == "#!":
        type = tokenize.TokenType.SHEBANG
    return tokenize.Token(value, token.location, type)


# Generates a list of incorrectly typed tokens and their correctly
# classified counterparts
@composite
def draw_tokens_classified(draw):
    tokens = draw(lists(draw_token_classified_garbled()))
    input = []
    for t in tokens:
        type = draw(draw_token_type())
        input.append(tokenize.Token(t.value, t.location, type))
    return (input, tokens)


# Test that classification can add types properly
# We expect the following behaviour:
# - Only the token type is modified
# - The token's type is set based on which of the following conditions
#   its value meets:
#   - KEYWORD if it entirely matches a keyword
#   - BOOL if it is True or False
#   - SPACE if it is a space or tab character
#   - NEWLINE if it is a new line character
#   - SHEBANG if it starts with "#!"
#   - UNKNOWN if it is none of the above
@given(draw_tokens_classified())
def test_tokenize_classification(test_data):
    (input, tokens) = test_data
    assert tokenize.classify_tokens(input) == tokens
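

# A concrete example of the classification rules above, as a sketch
# assuming classify_tokens follows that table: each value below maps
# to exactly one type, and values and locations pass through untouched.
def test_tokenize_classification_example():
    location = tokenize.TokenLocation(1, 1, "")
    pairs = [
        ("If", tokenize.TokenType.KEYWORD),
        ("True", tokenize.TokenType.BOOL),
        (" ", tokenize.TokenType.SPACE),
        ("\t", tokenize.TokenType.SPACE),
        ("\n", tokenize.TokenType.NEWLINE),
        ("#!/bin/env newlang", tokenize.TokenType.SHEBANG),
        ("Hello", tokenize.TokenType.UNKNOWN),
    ]
    input = [
        tokenize.Token(value, location, tokenize.TokenType.UNKNOWN)
        for (value, _) in pairs
    ]
    expected = [
        tokenize.Token(value, location, type) for (value, type) in pairs
    ]
    assert tokenize.classify_tokens(input) == expected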


# Draw a random string made of token values
@composite
def draw_source_fuzz(draw):
    tokens = draw(lists(draw_token_classified()))
    input = ""
    for t in tokens:
        input += t.value
    return input


# Test that the tokenize function behaves as we expect
# We expect the following behaviour:
# - The tokenizer splits the tokens as expected, then
# - The tokenizer classifies tokens as expected, then
# - The tokenizer sets the token locations as expected
@given(draw_source_fuzz(), text())
def test_tokenize_fuzz(source, filename):
    split = tokenize.split_tokens(source)
    classified = tokenize.classify_tokens(split)
    located = tokenize.locate_tokens(classified, filename)
    tokenized = tokenize.tokenize(source, filename)
    assert located == tokenized
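

# A concrete end-to-end sketch, assuming tokenize composes the three
# stages exactly as the fuzz test above asserts: "If True\n" splits
# into four tokens, which classify as KEYWORD, SPACE, BOOL and NEWLINE,
# located on line 1 at columns 1, 3, 4 and 8.
def test_tokenize_example():
    filename = "example"
    types = tokenize.TokenType
    loc = tokenize.TokenLocation
    expected = [
        tokenize.Token("If", loc(1, 1, filename), types.KEYWORD),
        tokenize.Token(" ", loc(1, 3, filename), types.SPACE),
        tokenize.Token("True", loc(1, 4, filename), types.BOOL),
        tokenize.Token("\n", loc(1, 8, filename), types.NEWLINE),
    ]
    assert tokenize.tokenize("If True\n", filename) == expected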