from itertools import groupby

from hypothesis import given, assume
from hypothesis.strategies import (
    text,
    booleans,
    sampled_from,
    one_of,
    characters,
    lists,
    composite,
    randoms,
    integers,
)

from src import parse

# Whitespace that separates lexer words
lexer_whitespace = "\n\t "

# List of keywords the lexer understands
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
]

# List of words the lexer understands
reserved_words = keywords + [
    "StartText",
    "EndText",
    "StartNote",
    "EndNote",
    "True",
    "False",
]


# Splits a string into tokens and their separators
def split_tokens(text, split_by):
    """Split ``text`` into alternating runs of separators and non-separators.

    Args:
        text: The string to split.
        split_by: A container of separator characters; each character of
            ``text`` is tested with ``in split_by``.

    Returns:
        A list of non-empty strings which, joined together, reproduce
        ``text``. Each element consists either entirely of separator
        characters or entirely of non-separator characters. An empty
        ``text`` yields an empty list.
    """
    # groupby starts a new run whenever the "is a separator" key flips
    return ["".join(run) for _, run in groupby(text, key=lambda c: c in split_by)]


# A sample token containing code to create a lexer token, and
# the resulting lexer type and value
# A type and value of 'None' is used for lexer code that
# should get ignored, such as shebangs and notes
class SampleToken:
    def __init__(self, code, type, value):
        # Source text that should make the lexer produce this token
        self.code = code
        # Expected token type and value
        self.type = type
        self.value = value
        # Starts unset; assigned from outside once the token is placed in code
        self.location = None

    def __repr__(self):
        return "SampleToken(code %s, type '%s', value %s, location %s)" % (
            repr(self.code),
            self.type,
            repr(self.value),
            self.location,
        )

    def __eq__(self, other):
        # Compares value, type and location (file, line, column) only.
        # 'other' just needs those attributes, so it does not have to be
        # a SampleToken. Both locations must be set before comparing.
        return (
            # Don't check code
            self.value == other.value
            and self.type == other.type
            and self.location.file == other.location.file
            and self.location.line == other.location.line
            and self.location.column == other.location.column
        )


# A soup of sample tokens
class SampleSoup:
    def __init__(self, tokens, code, filename):
        # The sample tokens making up the soup
        self.tokens = tokens
        # The source code text for the whole soup
        self.code = code
        # The file name associated with the code
        self.filename = filename

    def __repr__(self):
        return "SampleSoup(tokens %s, code %s, filename '%s')" % (
            self.tokens,
            repr(self.code),
            self.filename,
        )


# Draws a textual identifier consisting of random characters and reserved words
@composite
def draw_identifier(draw):
    """Return a string that is neither a reserved word nor a shebang."""
    identifiers = draw(
        lists(
            text(alphabet=characters(blacklist_characters=lexer_whitespace), min_size=1)
        )
    )
    # If we have no identifiers, draw at least two words so we don't accidentally
    # draw a reserved word alone.
    min_words = 0 if identifiers else 2
    words = draw(lists(sampled_from(reserved_words), min_size=min_words))
    all_words = identifiers + words
    draw(randoms()).shuffle(all_words)
    value = "".join(all_words)
    assume(value not in reserved_words)  # Reserved words aren't symbols
    assume(not value.startswith("#!"))  # Shebangs aren't symbols
    return value


# Draws whitespace ignored by the lexer
@composite
def draw_whitespace(draw):
    """Return a non-empty string made only of lexer whitespace characters."""
    return "".join(draw(lists(sampled_from(lexer_whitespace), min_size=1)))


# Draws a list of words separated by whitespace
@composite
def draw_joined_words(draw, words):
    """Join ``words`` with freshly drawn whitespace between each pair.

    Returns an empty string when ``words`` is empty.
    """
    if not words:
        return ""
    # No space before the first word
    parts = [words[0]]
    for word in words[1:]:
        parts.append(draw(draw_whitespace()))
        parts.append(word)
    return "".join(parts)


# Draws zero or more words made of identifiers and reserved words
@composite
def draw_word_salad(draw, exclude_words):
    """Return whitespace-joined identifiers and reserved words.

    ``exclude_words`` lists reserved words that must not be drawn as
    standalone words; each entry must be in ``reserved_words`` or
    ``list.remove`` raises ValueError.
    """
    reserved = list(reserved_words)
    for exclude in exclude_words:
        reserved.remove(exclude)
    strategies = [
        draw_identifier(),
        sampled_from(reserved),
    ]
    words = draw(lists(one_of(strategies)))
    return draw(draw_joined_words(words))


# Generates a Text token
@composite
def draw_token_text(draw):
    """Return a SampleToken of type "text" wrapped in StartText/EndText."""
    value = draw(draw_word_salad(["StartText", "EndText"]))
    space1 = draw(draw_whitespace())
    space2 = draw(draw_whitespace())
    code = "StartText" + space1 + value + space2 + "EndText"
    return SampleToken(code, "text", value.strip(lexer_whitespace))


# Generates a Bool token
@composite
def draw_token_bool(draw):
    """Return a SampleToken of type "bool" whose code is "True" or "False"."""
    value = draw(booleans())
    code = "True" if value else "False"
    return SampleToken(code, "bool", value)


# Generates a keyword token
@composite
def draw_token_keyword(draw):
    """Return a SampleToken of type "keyword" for a random keyword."""
    keyword = draw(sampled_from(keywords))
    return SampleToken(keyword, "keyword", keyword)


# Generates a symbol token
@composite
def draw_token_symbol(draw):
    """Return a SampleToken of type "symbol" for a random identifier."""
    symbol = draw(draw_identifier())
    return SampleToken(symbol, "symbol", symbol)


# Generates a note token
@composite
def draw_token_note(draw):
    """Return a SampleToken with no type/value wrapped in StartNote/EndNote."""
    value = draw(draw_word_salad(["StartNote", "EndNote"]))
    space1 = draw(draw_whitespace())
    space2 = draw(draw_whitespace())
    code = "StartNote" + space1 + value + space2 + "EndNote"
    return SampleToken(code, None, None)


# Generates a shebang token
@composite
def draw_token_shebang(draw):
    """Return a SampleToken with no type/value holding a one-line shebang."""
    # Strip newlines so the shebang stays on a single line
    shebang = draw(draw_word_salad([])).replace("\n", "")
    code = "#!" + shebang + "\n"
    return SampleToken(code, None, None)


# Generates an empty token
@composite
def draw_token_empty(draw):
    """Return a SampleToken with empty code and no type/value."""
    return SampleToken("", None, None)


# Generates a set of valid tokens
@composite
def draw_tokens_valid(draw):
    """Return a list of valid sample tokens, optionally led by one shebang."""
    strategies = [
        draw_token_text(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_symbol(),
        draw_token_note(),
        draw_token_empty(),
    ]
    # At most one shebang, and only ever as the first token
    shebang = draw(lists(draw_token_shebang(), max_size=1))
    tokens = draw(lists(one_of(strategies)))
    return shebang + tokens


# Generates a soup of tokens using a given strategy
@composite
def draw_token_soup(draw, all_tokens):
    """Build a SampleSoup from ``all_tokens``, assigning each a location.

    Every token's code is followed by drawn whitespace. Each token in
    ``all_tokens`` has its ``location`` set in place to the line and column
    at which its code starts. A trailing "EOF" sample token is appended to
    the soup's token list.
    """
    filename = draw(text())
    code_parts = []
    curr_line = 1
    curr_column = 1
    for token in all_tokens:
        space = draw(draw_whitespace())
        new_code = token.code + space
        lines = new_code.split("\n")
        code_parts.append(new_code)
        token.location = parse.ParseLocation(curr_line, curr_column, filename)
        curr_line += len(lines) - 1
        if len(lines) > 1:
            # A newline was crossed: column restarts after the last newline
            curr_column = len(lines[-1]) + 1
        else:
            curr_column += len(new_code)
    eof = SampleToken(None, "EOF", None)
    # NOTE(review): EOF is expected one column before the next free position;
    # presumably this mirrors how the tokenizer reports EOF -- confirm
    eof.location = parse.ParseLocation(curr_line, curr_column - 1, filename)
    return SampleSoup(all_tokens + [eof], "".join(code_parts), filename)


# Generates a soup of valid tokens
@composite
def draw_soup_valid(draw):
    """Return a SampleSoup built from valid tokens only."""
    tokens = draw(draw_tokens_valid())
    return draw(draw_token_soup(tokens))


# Generates a soup with a stray EndText
@composite
def draw_error_endtext_stray(draw):
    """Return ``(error_token, soup)`` with a lone EndText inserted at random."""
    tokens = draw(draw_tokens_valid())
    bad_token = SampleToken("EndText", None, None)
    index = draw(integers(min_value=0, max_value=len(tokens)))
    tokens.insert(index, bad_token)
    soup = draw(draw_token_soup(tokens))
    error_token = soup.tokens[index]
    return (error_token, soup)


# Generates a soup with a modified Text without EndText
# Only the latest Text in the soup is modified
@composite
def draw_error_endtext_remove(draw):
    """Return ``(error_token, soup)`` where the last Text lacks its EndText.

    If the drawn tokens contain no Text, one is inserted first (never in
    front of a leading shebang).
    """
    tokens = draw(draw_tokens_valid())
    text_index = None
    for i, token in enumerate(tokens):
        if token.type == "text":
            # Find the last Text so when we remove
            # an EndText the next Text won't close it
            text_index = i
    if text_index is None:
        new_text = draw(draw_token_text())
        lowest = 0
        highest = len(tokens)
        # Don't put a text before a shebang
        if highest > 0 and tokens[0].code.startswith("#!"):
            lowest = 1
        text_index = draw(integers(min_value=lowest, max_value=highest))
        tokens.insert(text_index, new_text)
    text_token = tokens[text_index]
    lex_tokens = split_tokens(text_token.code, lexer_whitespace)
    # The final piece is the closing "EndText"; drop it
    new_code = "".join(lex_tokens[:-1])
    error_token = SampleToken(new_code, None, None)
    tokens[text_index] = error_token
    soup = draw(draw_token_soup(tokens))
    return (error_token, soup)


# Test that we can lex tokens correctly
@given(draw_soup_valid())
def test_lexer_valid(soup):
    try:
        tokenizer = parse.Tokenizer(soup.code, soup.filename)
        tokens = tokenizer.tokenize()
    except parse.ParseError as e:
        raise AssertionError("ParseError thrown: %s" % (e)) from e
    assert len(tokens) <= len(soup.tokens)
    # Samples without a type (notes, shebangs, empties) produce no lexer token
    expected = [sample for sample in soup.tokens if sample.type]
    for actual, sample in zip(tokens, expected):
        assert actual == sample
    # Neither side may have leftover tokens
    assert len(tokens) == len(expected)


# Test that we can catch a stray EndText
@given(draw_error_endtext_stray())
def test_lexer_error_endtext_stray(error_soup):
    (error_token, soup) = error_soup
    try:
        tokenizer = parse.Tokenizer(soup.code, soup.filename)
        tokenizer.tokenize()
    except parse.ParseError as e:
        location = error_token.location
        assert e.context.parent is None
        assert e.context.context == "reading word"
        assert e.context.location == location
        assert e.error == "Found stray EndText"
    else:
        # Success?
        raise AssertionError("Tokenizer accepted a stray EndText")


# Test that removing an EndText from the last available text
# pair causes some error, either from the text data being read as
# code or from the text being unterminated
@given(draw_error_endtext_remove())
def test_lexer_error_endtext_remove(error_soup):
    (error_token, soup) = error_soup
    try:
        tokenizer = parse.Tokenizer(soup.code, soup.filename)
        tokenizer.tokenize()
    except parse.ParseError as e:
        location = error_token.location
        assert e.context.parent is None
        # e.context.context will be random
        assert e.context.location.line >= location.line
        if e.context.location.line == location.line:
            assert e.context.location.column >= location.column
        # e.error will be random
    else:
        # Success?
        raise AssertionError("Tokenizer accepted a Text without EndText")


# General fuzz test, make sure the parser doesn't fall apart and spew
# uncontrolled errors.
@given(text(), text())
def test_parser_fuzz(code, filename):
    try:
        tokenizer = parse.Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
        parser = parse.Parser(tokens)
        parser.parse_file()
    except parse.ParseError:
        # Controlled parse errors are the expected outcome for random input
        pass