from hypothesis import given, assume
from hypothesis.strategies import (
    text,
    booleans,
    sampled_from,
    one_of,
    characters,
    lists,
    composite,
    randoms,
)

from src import parse

# Whitespace that separates lexer words
lexer_whitespace = "\n\t "

# List of keywords the lexer understands
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
]

# List of words the lexer understands
reserved_words = keywords + [
    "StartText",
    "EndText",
    "StartNote",
    "EndNote",
    "True",
    "False",
]


# Quick function to split a string by separator characters, used for checking
# if generated text includes reserved words as standalone tokens
def split_by(string, separators):
    tokens = []
    curr_token = ""
    for c in string:
        if c in separators:
            if curr_token != "":
                tokens.append(curr_token)
                curr_token = ""
        else:
            curr_token += c
    if curr_token != "":
        tokens.append(curr_token)
    return tokens


# Wrapper to report ParseError nicely to Hypothesis
def safe_tokenize(code, filename):
    tokens = []
    try:
        tokenizer = parse.Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
    except parse.ParseError as e:
        raise AssertionError("ParseError thrown: %s" % (e))
    return tokens


# A sample token containing code to create a lexer token, and the resulting
# lexer type and value. A type and value of None are used for lexer code that
# should get ignored, such as shebangs and notes.
class SampleToken:
    def __init__(self, code, type, value):
        self.code = code
        self.type = type
        self.value = value

    def __repr__(self):
        return "SampleToken(code '%s', type '%s', value '%s')" % (
            self.code,
            self.type,
            self.value,
        )


# A soup of sample tokens
class SampleSoup:
    def __init__(self, tokens, code):
        self.tokens = tokens
        self.code = code

    def __repr__(self):
        return "SampleSoup(tokens %s, code '%s')" % (
            self.tokens,
            self.code,
        )


# Draws a textual identifier consisting of random characters and reserved words
@composite
def draw_identifier(draw):
    identifiers = draw(
        lists(
            text(alphabet=characters(blacklist_characters=lexer_whitespace), min_size=1)
        )
    )
    # If we have no identifiers, draw at least two words so we don't
    # accidentally draw a reserved word alone.
    min_words = 2 if len(identifiers) == 0 else 0
    words = draw(lists(sampled_from(reserved_words), min_size=min_words))
    all_words = identifiers + words
    draw(randoms()).shuffle(all_words)
    value = "".join(all_words)
    assume(value not in reserved_words)  # Reserved words aren't symbols
    assume(not value.startswith("#!"))  # Shebangs aren't symbols
    return value


# Draws whitespace ignored by the lexer
@composite
def draw_whitespace(draw):
    return "".join(draw(lists(sampled_from(lexer_whitespace), min_size=1)))


# Generates a Text token
@composite
def draw_token_text(draw):
    value = draw(text())
    text_tokens = split_by(value, lexer_whitespace)
    assume("StartText" not in text_tokens and "EndText" not in text_tokens)
    space1 = draw(draw_whitespace())
    space2 = draw(draw_whitespace())
    code = "StartText" + space1 + value + space2 + "EndText"
    return SampleToken(code, "text", value.strip(lexer_whitespace))


# Generates a Bool token
@composite
def draw_token_bool(draw):
    value = draw(booleans())
    code = "True" if value else "False"
    return SampleToken(code, "bool", value)


# Generates a keyword token
@composite
def draw_token_keyword(draw):
    keyword = draw(sampled_from(keywords))
    return SampleToken(keyword, "keyword", keyword)


# Generates a symbol token
@composite
def draw_token_symbol(draw):
    symbol = draw(draw_identifier())
    return SampleToken(symbol, "symbol", symbol)


# Generates a note token
@composite
def draw_token_note(draw):
    value = draw(text())
    note_tokens = split_by(value, lexer_whitespace)
    assume("StartNote" not in note_tokens and "EndNote" not in note_tokens)
    space1 = draw(draw_whitespace())
    space2 = draw(draw_whitespace())
    code = "StartNote" + space1 + value + space2 + "EndNote"
    return SampleToken(code, None, None)


# Generates a shebang token
@composite
def draw_token_shebang(draw):
    shebang = draw(text(alphabet=characters(blacklist_characters="\n")))
    code = "#!" + shebang + "\n"
    return SampleToken(code, None, None)


# Generates an empty token
@composite
def draw_token_empty(draw):
    return SampleToken("", None, None)


# Generates a soup of tokens
@composite
def draw_token_soup(draw):
    strategies = [
        draw_token_text(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_symbol(),
        draw_token_note(),
        draw_token_empty(),
    ]
    shebang = draw(lists(draw_token_shebang(), max_size=1))
    tokens = draw(lists(one_of(strategies)))
    all_tokens = shebang + tokens
    code = ""
    for token in all_tokens:
        space = draw(draw_whitespace())
        code += token.code + space
    return SampleSoup(all_tokens, code)


# Test that we can lex tokens correctly
@given(draw_token_soup())
def test_lexer_soup(soup):
    tokens = safe_tokenize(soup.code, "")
    EOF = len(tokens) - 1
    in_pos = 0
    out_pos = 0
    while out_pos < EOF and in_pos < len(soup.tokens):
        if not soup.tokens[in_pos].type:
            # Ignored sample tokens (notes, shebangs, empty code) produce no
            # lexer output, so only advance the input position
            in_pos += 1
        else:
            assert tokens[out_pos].type == soup.tokens[in_pos].type
            assert tokens[out_pos].value == soup.tokens[in_pos].value
            assert tokens[out_pos].location.file == ""
            in_pos += 1
            out_pos += 1
    assert tokens[out_pos].type == "EOF"
    assert tokens[out_pos].location.file == ""


# General fuzz test: make sure the parser doesn't fall apart and spew
# uncontrolled errors.
@given(text(), text())
def test_parser_fuzz(code, filename):
    try:
        tokenizer = parse.Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
        parser = parse.Parser(tokens)
        parser.parse_file()
    except parse.ParseError:
        pass
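

# A small deterministic check pinning down the behaviour of the split_by
# helper that the text and note strategies above rely on. This is an added
# sketch, not part of the original suite; it assumes only split_by and
# lexer_whitespace as defined in this file.
def test_split_by_examples():
    # Runs of separators collapse and never produce empty tokens
    assert split_by("a b\tc", lexer_whitespace) == ["a", "b", "c"]
    # Leading and trailing whitespace is dropped entirely
    assert split_by("  padded \n ", lexer_whitespace) == ["padded"]
    # An empty string yields no tokens
    assert split_by("", lexer_whitespace) == []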