from hypothesis import given, assume
from hypothesis.strategies import (
    text,
    booleans,
    sampled_from,
    one_of,
    characters,
    lists,
    composite,
)

from src import parse

# Whitespace that separates lexer words
lexer_whitespace = "\n\t "

# List of keywords the lexer understands
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
]

# List of reserved words the lexer understands
reserved_words = keywords + [
    "StartText",
    "EndText",
    "StartNote",
    "EndNote",
    "True",
    "False",
]


# Helper to split a string on a set of separator characters, used for
# checking whether generated text contains reserved words as tokens
def split_by(string, separators):
    tokens = []
    curr_token = ""
    for c in string:
        if c in separators:
            if curr_token != "":
                tokens.append(curr_token)
            curr_token = ""
        else:
            curr_token += c
    if curr_token != "":
        tokens.append(curr_token)
    return tokens


# Wrapper to report ParseError nicely to Hypothesis
def safe_tokenize(code, filename):
    tokens = []
    try:
        tokenizer = parse.Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
    except parse.ParseError as e:
        raise AssertionError("ParseError thrown: %s" % (e))
    return tokens


class SampleToken:
    def __init__(self, code, type, value):
        self.code = code
        self.type = type
        self.value = value

    def __repr__(self):
        return "SampleToken(code '%s', type '%s', value '%s')" % (
            self.code,
            self.type,
            self.value,
        )


# Generates a Text token
@composite
def draw_token_text(draw):
    value = draw(text())
    text_tokens = split_by(value, lexer_whitespace)
    assume("StartText" not in text_tokens and "EndText" not in text_tokens)
    space1 = draw(sampled_from(lexer_whitespace))
    space2 = draw(sampled_from(lexer_whitespace))
    code = "StartText" + space1 + value + space2 + "EndText"
    return SampleToken(code, "text", value.strip(lexer_whitespace))


# Generates a Bool token
@composite
def draw_token_bool(draw):
    value = draw(booleans())
    code = "True" if value else "False"
    return SampleToken(code, "bool", value)


# Generates a keyword token
@composite
def draw_token_keyword(draw):
    keyword = draw(sampled_from(keywords))
    return SampleToken(keyword, "keyword", keyword)


# Generates a symbol token
@composite
def draw_token_symbol(draw):
    symbol = draw(
        text(alphabet=characters(blacklist_characters=lexer_whitespace), min_size=1)
    )
    assume(symbol not in reserved_words)  # Reserved words aren't symbols
    assume(not symbol.startswith("#!"))  # Shebangs aren't symbols
    return SampleToken(symbol, "symbol", symbol)


# Generates any token
@composite
def draw_token(draw):
    strategies = [
        draw_token_text(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_symbol(),
    ]
    return draw(one_of(strategies))


# Test that we can lex tokens correctly
@given(draw_token())
def test_lexer_token(token):
    tokens = safe_tokenize(token.code, "")
    assert tokens[0].type == token.type
    assert tokens[0].value == token.value
    assert tokens[0].location.line == 1
    assert tokens[0].location.column == 1
    assert tokens[0].location.file == ""
    assert tokens[1].type == "EOF"


# Test that we can make notes using StartNote and EndNote syntax
@given(text(), sampled_from(lexer_whitespace), sampled_from(lexer_whitespace))
def test_lexer_note(note, space1, space2):
    note_tokens = split_by(note, lexer_whitespace)
    assume("StartNote" not in note_tokens and "EndNote" not in note_tokens)
    code = "StartNote" + space1 + note + space2 + "EndNote"
    tokens = safe_tokenize(code, "")
    assert tokens[0].type == "EOF"


# Generates a reserved word
@composite
def draw_token_reserved(draw):
    word = draw(sampled_from(reserved_words))
    return SampleToken(word, "reserved", word)


# Generates a token made of conjoined tokens
@composite
def draw_token_conjoined(draw):
    strategies = [
        draw_token_bool(),
        draw_token_reserved(),
        draw_token_symbol(),
    ]
    symbol = ""
    tokens = draw(lists(one_of(strategies), min_size=2))
    for token in tokens:
        symbol += token.code
    assume(symbol not in reserved_words)  # Reserved words aren't symbols
    return SampleToken(symbol, "symbol", symbol)


# Test that reserved words aren't read without whitespace
@given(draw_token_conjoined())
def test_lexer_conjoined_token(token):
    tokens = safe_tokenize(token.code, "")
    assert tokens[0].type == "symbol"
    assert tokens[0].value == token.value
    assert tokens[0].location.line == 1
    assert tokens[0].location.column == 1
    assert tokens[0].location.file == ""
    assert tokens[1].type == "EOF"


# Test that shebangs are skipped
@given(text(alphabet=characters(blacklist_characters="\n")))
def test_lexer_shebang(shebang):
    code = "#!" + shebang + "\n"
    tokens = safe_tokenize(code, "")
    assert tokens[0].type == "EOF"
    assert tokens[0].location.line == 2
    assert tokens[0].location.column == 0
    assert tokens[0].location.file == ""


# Test with no data at all
def test_parser_empty():
    tokens = safe_tokenize("", "")
    assert tokens[0].type == "EOF"
    assert tokens[0].location.line == 1
    assert tokens[0].location.column == 0
    assert tokens[0].location.file == ""


# General fuzz test, make sure the parser doesn't fall apart and spew
# uncontrolled errors.
@given(text(), text())
def test_parser_fuzz(code, filename):
    try:
        tokenizer = parse.Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
        parser = parse.Parser(tokens)
        parser.parse_file()
    except parse.ParseError:
        pass