# SPDX-License-Identifier: LGPL-2.1-only
# Copyright 2022 Jookia <contact@jookia.org>

from hypothesis import given, assume
from hypothesis.strategies import (
    text,
    booleans,
    sampled_from,
    one_of,
    characters,
    lists,
    composite,
    randoms,
    just,
)

from newlang import parse

# Whitespace that separates lexer words
lexer_whitespace = "\n\t "

# List of keywords the lexer understands
keywords = [
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
]

# List of words the lexer understands
reserved_words = keywords + [
    "StartText",
    "EndText",
    "StartNote",
    "EndNote",
    "True",
    "False",
]


# A sample token containing the code used to create a lexer token, and
# the resulting lexer type and value
# A type and value of 'None' is used for lexer code that
# should get ignored, such as notes
class SampleToken:
    def __init__(self, code, type, value):
        self.code = code
        self.type = type
        self.value = value
        self.location = None

    def __repr__(self):
        return "SampleToken(code %s, type '%s', value %s, location %s)" % (
            repr(self.code),
            self.type,
            repr(self.value),
            self.location,
        )

    def __eq__(self, other):
        if other is None:
            return False
        return (
            # Don't check code
            self.value == other.value
            and self.type == other.type
            and self.location.file == other.location.file
            and self.location.line == other.location.line
            and self.location.column == other.location.column
        )


# A soup of sample tokens
class SampleSoup:
    def __init__(self, tokens, code, filename):
        self.tokens = tokens
        self.code = code
        self.filename = filename

    def __repr__(self):
        return "SampleSoup(tokens %s, code %s, filename '%s')" % (
            self.tokens,
            repr(self.code),
            self.filename,
        )


# Draws a textual identifier consisting of random characters and reserved words
@composite
def draw_identifier(draw):
    identifiers = draw(
        lists(
            text(alphabet=characters(blacklist_characters=lexer_whitespace), min_size=1)
        )
    )
    # If we have no identifiers, draw at least two words so we don't accidentally
    # draw a reserved word alone.
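    # Even so, a single randomly drawn identifier can still spell out a
    # reserved word by itself, so the assume() below filters those draws too.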
    min_words = 2 if len(identifiers) == 0 else 0
    words = draw(lists(sampled_from(reserved_words), min_size=min_words))
    all_words = identifiers + words
    draw(randoms()).shuffle(all_words)
    value = "".join(all_words)
    assume(value not in reserved_words)  # Reserved words aren't symbols
    return value


# Draws whitespace ignored by the lexer
@composite
def draw_whitespace(draw):
    return "".join(draw(lists(sampled_from(lexer_whitespace), min_size=1)))


# Draws a list of words separated by whitespace
@composite
def draw_joined_words(draw, words):
    output = ""
    for word in words[0:1]:  # No space before the first word
        output += word
    for word in words[1:]:
        space = draw(draw_whitespace())
        output += space + word
    return output


# Draws zero or more words made of identifiers and reserved words
@composite
def draw_word_salad(draw, exclude_words):
    reserved = reserved_words.copy()
    for exclude in exclude_words:
        reserved.remove(exclude)
    strategies = [
        draw_identifier(),
        sampled_from(reserved),
    ]
    words = draw(lists(one_of(strategies)))
    return draw(draw_joined_words(words))


# Generates a Text token
@composite
def draw_token_text(draw):
    value = draw(draw_word_salad(["StartText", "EndText"]))
    space1 = draw(draw_whitespace())
    space2 = draw(draw_whitespace())
    code = "StartText" + space1 + value + space2 + "EndText"
    return SampleToken(code, "text", value.strip(lexer_whitespace))


# Generates a Bool token
@composite
def draw_token_bool(draw):
    value = draw(booleans())
    if value is True:
        code = "True"
    else:
        code = "False"
    return SampleToken(code, "bool", value)


# Generates a keyword token
@composite
def draw_token_keyword(draw):
    keyword = draw(sampled_from(keywords))
    return SampleToken(keyword, "keyword", keyword)


# Generates a symbol token
@composite
def draw_token_symbol(draw):
    symbol = draw(draw_identifier())
    return SampleToken(symbol, "symbol", symbol)


# Generates a note token
@composite
def draw_token_note(draw):
    value = draw(draw_word_salad(["StartNote", "EndNote"]))
    space1 = draw(draw_whitespace())
    space2 = draw(draw_whitespace())
    code = "StartNote" + space1 + value + space2 + "EndNote"
    return SampleToken(code, None, None)


# Generates an empty token
def draw_token_empty():
    return just(SampleToken("", None, None))


# Generates a set of valid tokens
@composite
def draw_tokens_valid(draw):
    strategies = [
        draw_token_text(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_symbol(),
        draw_token_note(),
        draw_token_empty(),
    ]
    tokens = draw(lists(one_of(strategies)))
    return tokens


# Generates a soup from a given list of tokens, gluing them together with
# whitespace and recording each token's expected source location
@composite
def draw_token_soup(draw, all_tokens):
    filename = draw(text())
    code = ""
    curr_line = 1
    curr_column = 1
    for token in all_tokens:
        space = draw(draw_whitespace())
        new_code = token.code + space
        lines = new_code.split("\n")
        code += new_code
        token.location = parse.ParseLocation(curr_line, curr_column, filename)
        curr_line += len(lines) - 1
        if len(lines) > 1:
            curr_column = len(lines[-1]) + 1
        else:
            curr_column += len(new_code)
    eof = SampleToken(None, "EOF", None)
    eof.location = parse.ParseLocation(curr_line, curr_column - 1, filename)
    return SampleSoup(all_tokens + [eof], code, filename)


# Generates a soup of valid tokens
@composite
def draw_soup_valid(draw):
    tokens = draw(draw_tokens_valid())
    soup = draw(draw_token_soup(tokens))
    return soup


# Test that we can lex tokens correctly
@given(draw_soup_valid())
def test_lexer_valid(soup):
    try:
        tokenizer = parse.Tokenizer(soup.code, soup.filename)
        tokens = tokenizer.tokenize()
    except parse.ParseError as e:
        raise AssertionError("ParseError thrown: %s" % (e))
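    # Notes and empty tokens have a type of None and produce no lexer output,
    # so the lexer may return fewer tokens than the soup contains; walk both
    # lists in step, matching only the tokens the lexer should emit.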
    assert len(tokens) <= len(soup.tokens)
    in_pos = 0
    out_pos = 0
    while in_pos < len(soup.tokens):
        if soup.tokens[in_pos].type:
            assert tokens[out_pos] == soup.tokens[in_pos]
            out_pos += 1
        in_pos += 1
    assert in_pos == len(soup.tokens)
    assert out_pos == len(tokens)


# General fuzz test: make sure the parser doesn't fall apart and spew
# uncontrolled errors.
@given(text(), text())
def test_parser_fuzz(code, filename):
    try:
        tokenizer = parse.Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
        parser = parse.Parser(tokens)
        parser.parse_file()
    except parse.ParseError:
        pass
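

# A minimal sketch of the line/column bookkeeping performed in
# draw_token_soup above, kept as a module-level self-check; _advance and its
# sample inputs are illustrative assumptions, not part of the newlang API.
def _advance(line, column, chunk):
    # Mirror the rules above: a newline advances the line counter and resets
    # the column past whatever trails the final newline; otherwise the
    # column simply advances by the chunk's length
    parts = chunk.split("\n")
    if len(parts) > 1:
        return line + len(parts) - 1, len(parts[-1]) + 1
    return line, column + len(chunk)


assert _advance(1, 1, "True\n ") == (2, 2)  # "True", a newline, then a space
assert _advance(1, 1, "True ") == (1, 6)  # same line, column moves past "True "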