from hypothesis import given, assume
from hypothesis.strategies import (
    text,
    booleans,
    sampled_from,
    one_of,
    characters,
    lists,
    composite,
)

from src import parse

# Whitespace that separates lexer words
lexer_whitespace = "\n\t "

# List of keywords the lexer understands
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
]

# List of words the lexer understands.
# NOTE(review): the tests below emit "BeginText"/"BeginNote" markers, but this
# list reserves "StartText"/"StartNote" — confirm against the lexer which
# spelling is actually reserved; a mismatch would let draw_token_symbol()
# generate a "BeginText" symbol that the lexer may treat specially.
reserved_words = keywords + ["StartText", "EndText", "StartNote", "EndNote"]


# Quick function to split a string by different characters, used for checking
# if generated text includes tokens
def split_by(string, separators):
    """Split *string* on any character in *separators*, dropping empty tokens."""
    tokens = []
    current = ""
    for char in string:
        if char in separators:
            # Separator ends the current token; flush it if non-empty
            if current:
                tokens.append(current)
                current = ""
        else:
            current += char
    if current:
        tokens.append(current)
    return tokens


class SampleToken:
    """A generated code snippet paired with the token the lexer should produce."""

    def __init__(self, code, type, value):
        self.code = code    # source text fed to the tokenizer
        self.type = type    # expected token type ("text", "bool", ...)
        self.value = value  # expected token value

    def __repr__(self):
        return "SampleToken(code '%s', type '%s', value '%s')" % (
            self.code,
            self.type,
            self.value,
        )


# Generates a Text token
@composite
def draw_token_text(draw):
    value = draw(text())
    # The generated body must not itself contain the delimiting markers
    text_tokens = split_by(value, lexer_whitespace)
    assume("BeginText" not in text_tokens and "EndText" not in text_tokens)
    space1 = draw(sampled_from(lexer_whitespace))
    space2 = draw(sampled_from(lexer_whitespace))
    code = "BeginText" + space1 + value + space2 + "EndText"
    return SampleToken(code, "text", value)


# Generates a Bool token
@composite
def draw_token_bool(draw):
    value = draw(booleans())
    code = "True" if value else "False"
    return SampleToken(code, "bool", value)


# Generates a keyword token
@composite
def draw_token_keyword(draw):
    keyword = draw(sampled_from(keywords))
    return SampleToken(keyword, "keyword", keyword)


# Generates a symbol token
@composite
def draw_token_symbol(draw):
    symbol = draw(
        text(alphabet=characters(blacklist_characters=lexer_whitespace), min_size=1)
    )
    assume(symbol not in reserved_words)  # Reserved words aren't symbols
    assume(not symbol.startswith("#!"))  # Shebangs aren't symbols
    return SampleToken(symbol, "symbol", symbol)


# Generates any token
@composite
def draw_token(draw):
    strategies = [
        draw_token_text(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_symbol(),
    ]
    return draw(one_of(strategies))


# Test that we can lex tokens correctly
@given(draw_token())
def test_lexer_token(token):
    tokens = []
    try:
        tokenizer = parse.Tokenizer(token.code, "")
        tokens = tokenizer.tokenize()
    except parse.ParseError as e:
        raise AssertionError("ParseError thrown: %s" % (e))
    assert tokens[0].type == token.type
    assert tokens[0].value == token.value
    assert tokens[0].location.line == 1
    assert tokens[0].location.column == 1
    assert tokens[0].location.file == ""
    assert tokens[1].type == "EOF"


# Test that we can make notes using BeginNote and EndNote syntax.
@given(text(), sampled_from(lexer_whitespace), sampled_from(lexer_whitespace))
def test_lexer_note(note, space1, space2):
    # The note body must not itself contain the delimiting markers
    note_tokens = split_by(note, lexer_whitespace)
    assume("BeginNote" not in note_tokens and "EndNote" not in note_tokens)
    code = "BeginNote" + space1 + note + space2 + "EndNote"
    tokenizer = parse.Tokenizer(code, "")
    tokens = tokenizer.tokenize()
    # Notes are discarded entirely, so only EOF remains
    assert tokens[0].type == "EOF"


# Test that reserved words aren't read without whitespace
@given(lists(sampled_from(reserved_words), min_size=2))
def test_lexer_conjoined_words(words):
    word = "".join(words)
    tokenizer = parse.Tokenizer(word, "")
    tokens = tokenizer.tokenize()
    assert tokens[0].type == "symbol"
    assert tokens[0].value == word
    assert tokens[0].location.line == 1
    assert tokens[0].location.column == 1
    assert tokens[0].location.file == ""
    assert tokens[1].type == "EOF"


# Test that shebangs are skipped
@given(text(alphabet=characters(blacklist_characters="\n")))
def test_lexer_shebang(shebang):
    code = "#!" + shebang + "\n"
    tokenizer = parse.Tokenizer(code, "")
    tokens = tokenizer.tokenize()
    assert tokens[0].type == "EOF"
    assert tokens[0].location.line == 2
    assert tokens[0].location.column == 0
    assert tokens[0].location.file == ""


# Test with no data at all
def test_parser_empty():
    tokenizer = parse.Tokenizer("", "")
    tokens = tokenizer.tokenize()
    assert tokens[0].type == "EOF"
    assert tokens[0].location.line == 1
    assert tokens[0].location.column == 0
    assert tokens[0].location.file == ""


# General fuzz test, make sure the parser doesn't fall apart and spew
# uncontrolled errors.
@given(text(), text())
def test_parser_fuzz(code, filename):
    try:
        tokenizer = parse.Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
        parser = parse.Parser(tokens)
        parser.parse_file()
    except parse.ParseError:
        pass