from hypothesis import given, assume
from hypothesis.strategies import (
    text,
    booleans,
    sampled_from,
    one_of,
    characters,
    lists,
    composite,
)

from src import parse

# Whitespace that separates lexer words
lexer_whitespace = "\n\t "

# List of keywords the lexer understands
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
]

# List of words the lexer understands
reserved_words = keywords + [
    "StartText",
    "EndText",
    "StartNote",
    "EndNote",
    "True",
    "False",
]


def split_by(string, characters):
    """Split *string* on any single character found in *characters*.

    Empty segments are dropped. Used for checking whether generated text
    accidentally contains reserved token words.
    """
    tokens = []
    curr_token = ""
    for c in string:
        if c in characters:
            # Separator reached: flush the accumulated token, if any.
            if curr_token != "":
                tokens.append(curr_token)
                curr_token = ""
        else:
            curr_token += c
    if curr_token != "":
        tokens.append(curr_token)
    return tokens


def safe_tokenize(code, filename):
    """Tokenize *code*, reporting a ParseError nicely to Hypothesis.

    A ParseError is converted into an AssertionError so Hypothesis treats
    it as a test failure (and can shrink the failing input) rather than
    an unexpected crash.
    """
    tokens = []
    try:
        tokenizer = parse.Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
    except parse.ParseError as e:
        raise AssertionError("ParseError thrown: %s" % (e))
    return tokens


class SampleToken:
    """A generated token: the source text plus its expected type/value."""

    def __init__(self, code, type, value):
        self.code = code
        self.type = type
        self.value = value

    def __repr__(self):
        return "SampleToken(code '%s', type '%s', value '%s')" % (
            self.code,
            self.type,
            self.value,
        )


class SampleSoup:
    """A generated sequence of tokens together with the combined source."""

    def __init__(self, tokens, code):
        self.tokens = tokens
        self.code = code

    def __repr__(self):
        return "SampleSoup(tokens %s, code '%s')" % (
            self.tokens,
            self.code,
        )


# Generates a Text token
@composite
def draw_token_text(draw):
    value = draw(text())
    # The text body must not itself contain the delimiter words, or the
    # lexer would terminate the token early.
    text_tokens = split_by(value, lexer_whitespace)
    assume("StartText" not in text_tokens and "EndText" not in text_tokens)
    space1 = draw(sampled_from(lexer_whitespace))
    space2 = draw(sampled_from(lexer_whitespace))
    code = "StartText" + space1 + value + space2 + "EndText"
    return SampleToken(code, "text", value.strip(lexer_whitespace))


# Generates a Bool token
@composite
def draw_token_bool(draw):
    # Fixed: the local was previously named `bool` (shadowing the builtin)
    # and tested with `== True` instead of using the value directly.
    value = draw(booleans())
    code = "True" if value else "False"
    return SampleToken(code, "bool", value)


# Generates a keyword token
@composite
def draw_token_keyword(draw):
    keyword = draw(sampled_from(keywords))
    return SampleToken(keyword, "keyword", keyword)


# Generates a symbol token
@composite
def draw_token_symbol(draw):
    symbol = draw(
        text(alphabet=characters(blacklist_characters=lexer_whitespace), min_size=1)
    )
    assume(symbol not in reserved_words)  # Reserved words aren't symbols
    assume(not symbol.startswith("#!"))  # Shebangs aren't symbols
    return SampleToken(symbol, "symbol", symbol)


# Generates a note token
@composite
def draw_token_note(draw):
    value = draw(text())
    # As with text tokens, the note body must not contain the delimiters.
    note_tokens = split_by(value, lexer_whitespace)
    assume("StartNote" not in note_tokens and "EndNote" not in note_tokens)
    space1 = draw(sampled_from(lexer_whitespace))
    space2 = draw(sampled_from(lexer_whitespace))
    code = "StartNote" + space1 + value + space2 + "EndNote"
    return SampleToken(code, "note", None)


# Generates a shebang token
@composite
def draw_token_shebang(draw):
    # A shebang runs to the end of its line, so it may not contain newlines.
    shebang = draw(text(alphabet=characters(blacklist_characters="\n")))
    code = "#!" + shebang + "\n"
    return SampleToken(code, "shebang", None)


# Generates an empty token
@composite
def draw_token_empty(draw):
    return SampleToken("", "empty", None)


# Generates a soup of tokens
@composite
def draw_token_soup(draw):
    strategies = [
        draw_token_text(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_symbol(),
        draw_token_note(),
        draw_token_empty(),
    ]
    # At most one shebang, and only at the very start of the code.
    shebang = draw(lists(draw_token_shebang(), max_size=1))
    tokens = draw(lists(one_of(strategies)))
    all_tokens = shebang + tokens
    code = ""
    for token in all_tokens:
        # Separate consecutive tokens with at least one whitespace character.
        space = "".join(draw(lists(sampled_from(lexer_whitespace), min_size=1)))
        code += token.code + space
    return SampleSoup(all_tokens, code)


# Test that we can lex tokens correctly
@given(draw_token_soup())
def test_lexer_soup(soup):
    tokens = safe_tokenize(soup.code, "")
    EOF = len(tokens) - 1
    in_pos = 0
    out_pos = 0
    # Token types the lexer consumes without emitting an output token.
    ignore_types = ["note", "shebang", "empty"]
    while out_pos < EOF and in_pos < len(soup.tokens):
        if soup.tokens[in_pos].type in ignore_types:
            in_pos += 1
        else:
            assert tokens[out_pos].type == soup.tokens[in_pos].type
            assert tokens[out_pos].value == soup.tokens[in_pos].value
            assert tokens[out_pos].location.file == ""
            in_pos += 1
            out_pos += 1
    # The lexer always terminates its output with an EOF token.
    assert tokens[out_pos].type == "EOF"
    assert tokens[out_pos].location.file == ""


# General fuzz test, make sure the parser doesn't fall apart and spew
# uncontrolled errors.
@given(text(), text())
def test_parser_fuzz(code, filename):
    try:
        tokenizer = parse.Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
        parser = parse.Parser(tokens)
        parser.parse_file()
    except parse.ParseError:
        # ParseError is the only error the parser is allowed to raise.
        pass