NewLang/tests/test_parse.py at b3b0d041ec1870205cf7dc318d37c3f5edacba38

Fork: 0
LuminaSensum / NewLang
Find file
Newer
Older
NewLang / tests / test_parse.py
Jookia on 18 Jan 2022 10 KB tests: Clarify purpose of draw_error_endtext_remove
Raw Blame History
from hypothesis import given, assume
from hypothesis.strategies import (
    text,
    booleans,
    sampled_from,
    one_of,
    characters,
    lists,
    composite,
    randoms,
    integers,
)

from src import parse

# Characters the lexer treats as separators between words
lexer_whitespace = "\n\t "

# Keywords the lexer understands
keywords = [
    "NewLang",
    "Done",
    "Set",
    "To",
    "EndSet",
    "If",
    "Then",
    "Else",
    "EndIf",
]


# Every word with a special meaning to the lexer: the keywords plus the
# text and note delimiters and the boolean literals
reserved_words = [
    *keywords,
    "StartText",
    "EndText",
    "StartNote",
    "EndNote",
    "True",
    "False",
]


# Breaks a string into alternating runs of separator and non-separator
# characters, keeping the separator runs in the output
def split_tokens(text, split_by):
    pieces = []
    run_start = 0
    # A new run begins wherever two neighbouring characters disagree
    # about being a separator
    for pos in range(1, len(text)):
        if (text[pos] in split_by) != (text[pos - 1] in split_by):
            pieces.append(text[run_start:pos])
            run_start = pos
    # Whatever follows the last boundary is the final run; empty input
    # has no runs at all
    if text:
        pieces.append(text[run_start:])
    return pieces


# A sample token pairing the code that should create a lexer token
# with the type and value the lexer is expected to report for it
# A type and value of 'None' is used for lexer code that
# should get ignored, such as shebangs and notes
class SampleToken:
    def __init__(self, code, type, value):
        self.code, self.type, self.value = code, type, value
        # Not known at construction; filled in once the token is
        # placed in a soup
        self.location = None

    def __repr__(self):
        return (
            f"SampleToken(code {self.code!r}, type '{self.type}', "
            f"value {self.value!r}, location {self.location})"
        )

    def __eq__(self, other):
        # The code is deliberately left out of the comparison
        if not (self.value == other.value and self.type == other.type):
            return False
        # Locations are only inspected once value and type agree
        mine, theirs = self.location, other.location
        return (
            mine.file == theirs.file
            and mine.line == theirs.line
            and mine.column == theirs.column
        )


# A soup of sample tokens: the tokens themselves, the code they were
# rendered into and the filename that goes with that code
class SampleSoup:
    def __init__(self, tokens, code, filename):
        self.tokens, self.code, self.filename = tokens, code, filename

    def __repr__(self):
        return (
            f"SampleSoup(tokens {self.tokens}, code {self.code!r}, "
            f"filename '{self.filename}')"
        )


# Draws a textual identifier built from random characters and reserved words
@composite
def draw_identifier(draw):
    # Random fragments may contain anything except lexer whitespace
    non_space = characters(blacklist_characters=lexer_whitespace)
    random_parts = draw(lists(text(alphabet=non_space, min_size=1)))
    # With no random fragments to pad the result, ask for at least two
    # reserved words so we don't accidentally draw a reserved word alone
    fewest_reserved = 0 if random_parts else 2
    reserved_parts = draw(
        lists(sampled_from(reserved_words), min_size=fewest_reserved)
    )
    parts = random_parts + reserved_parts
    draw(randoms()).shuffle(parts)
    identifier = "".join(parts)
    assume(identifier not in reserved_words)  # Reserved words aren't symbols
    assume(not identifier.startswith("#!"))  # Shebangs aren't symbols
    return identifier


# Draws a non-empty run of whitespace ignored by the lexer
@composite
def draw_whitespace(draw):
    chars = draw(lists(sampled_from(lexer_whitespace), min_size=1))
    return "".join(chars)


# Draws the given words joined into one string, with freshly drawn
# whitespace between each neighbouring pair
@composite
def draw_joined_words(draw, words):
    if not words:
        return ""
    # The first word stands alone: no space goes before it
    pieces = [words[0]]
    for word in words[1:]:
        pieces.append(draw(draw_whitespace()))
        pieces.append(word)
    return "".join(pieces)


# Draws zero or more whitespace-separated words, each one either an
# identifier or a reserved word that is not listed in exclude_words
@composite
def draw_word_salad(draw, exclude_words):
    # Work on a copy so the shared reserved word list is never modified
    allowed_reserved = list(reserved_words)
    for unwanted in exclude_words:
        allowed_reserved.remove(unwanted)
    any_word = one_of(draw_identifier(), sampled_from(allowed_reserved))
    chosen = draw(lists(any_word))
    return draw(draw_joined_words(chosen))


# Generates a Text token
@composite
def draw_token_text(draw):
    # The body may not contain the delimiters as standalone words
    body = draw(draw_word_salad(["StartText", "EndText"]))
    leading = draw(draw_whitespace())
    trailing = draw(draw_whitespace())
    code = "".join(["StartText", leading, body, trailing, "EndText"])
    # The expected value is the body without surrounding whitespace
    return SampleToken(code, "text", body.strip(lexer_whitespace))


# Generates a Bool token
@composite
def draw_token_bool(draw):
    flag = draw(booleans())
    # The code is the literal word for the drawn value
    code = "True" if flag is True else "False"
    return SampleToken(code, "bool", flag)


# Generates a keyword token
@composite
def draw_token_keyword(draw):
    # A keyword's code and expected value are the same word
    word = draw(sampled_from(keywords))
    return SampleToken(code=word, type="keyword", value=word)


# Generates a symbol token
@composite
def draw_token_symbol(draw):
    # A symbol's code and expected value are the same identifier
    name = draw(draw_identifier())
    return SampleToken(code=name, type="symbol", value=name)


# Generates a note token
@composite
def draw_token_note(draw):
    # The body may not contain the delimiters as standalone words
    body = draw(draw_word_salad(["StartNote", "EndNote"]))
    before = draw(draw_whitespace())
    after = draw(draw_whitespace())
    # No type or value: the sample marks code that should get ignored
    return SampleToken(f"StartNote{before}{body}{after}EndNote", None, None)


# Generates a shebang token
@composite
def draw_token_shebang(draw):
    salad = draw(draw_word_salad([]))
    # Strip newlines so the only one left is the terminator added below
    one_line = salad.replace("\n", "")
    # No type or value: the sample marks code that should get ignored
    return SampleToken("#!" + one_line + "\n", None, None)


# Generates an empty token: no code at all, and no type or value
@composite
def draw_token_empty(draw):
    # Nothing is drawn, but each call still builds a separate token object
    empty = SampleToken("", None, None)
    return empty


# Generates a list of valid tokens, optionally led by a single shebang
@composite
def draw_tokens_valid(draw):
    # At most one shebang, and only ever as the very first token
    leading_shebang = draw(lists(draw_token_shebang(), max_size=1))
    any_token = one_of(
        draw_token_text(),
        draw_token_bool(),
        draw_token_keyword(),
        draw_token_symbol(),
        draw_token_note(),
        draw_token_empty(),
    )
    body = draw(lists(any_token))
    return leading_shebang + body


# Generates a soup of tokens from the given list of sample tokens
# Each token's code is followed by random whitespace, and each token is
# stamped with the line and column where its code starts. An EOF token
# is appended after the given tokens.
@composite
def draw_token_soup(draw, all_tokens):
    filename = draw(text())
    chunks = []
    line, column = 1, 1
    for token in all_tokens:
        chunk = token.code + draw(draw_whitespace())
        chunks.append(chunk)
        token.location = parse.ParseLocation(line, column, filename)
        newlines = chunk.count("\n")
        line += newlines
        if newlines:
            # Columns restart at 1 just after the chunk's last newline
            column = len(chunk) - chunk.rfind("\n")
        else:
            column += len(chunk)
    eof = SampleToken(None, "EOF", None)
    # The EOF sits one column before where the next token would start
    eof.location = parse.ParseLocation(line, column - 1, filename)
    return SampleSoup(all_tokens + [eof], "".join(chunks), filename)


# Generates a soup made only of valid tokens
@composite
def draw_soup_valid(draw):
    valid_tokens = draw(draw_tokens_valid())
    return draw(draw_token_soup(valid_tokens))


# Generates a soup with a stray EndText
# Returns the stray token together with the soup that contains it
@composite
def draw_error_endtext_stray(draw):
    tokens = draw(draw_tokens_valid())
    stray = SampleToken("EndText", None, None)
    # Any position will do, including after the last token
    position = draw(integers(min_value=0, max_value=len(tokens)))
    tokens.insert(position, stray)
    soup = draw(draw_token_soup(tokens))
    return (soup.tokens[position], soup)


# Generates a soup in which one Text has lost its EndText
# Only the last Text in the soup is modified
# Returns the modified token together with the soup that contains it
@composite
def draw_error_endtext_remove(draw):
    tokens = draw(draw_tokens_valid())
    # Search backwards for the last Text, so that when we remove
    # an EndText no later Text is left to close it
    target = None
    for pos in range(len(tokens) - 1, -1, -1):
        if tokens[pos].type == "text":
            target = pos
            break
    if target is None:
        # No Text was drawn, so add one at a random position
        fresh_text = draw(draw_token_text())
        # Don't put a text before a shebang
        has_shebang = len(tokens) > 0 and tokens[0].code.startswith("#!")
        lowest = 1 if has_shebang else 0
        target = draw(integers(min_value=lowest, max_value=len(tokens)))
        tokens.insert(target, fresh_text)
    # Drop the final lexer word of the Text's code, which is its EndText
    words = split_tokens(tokens[target].code, lexer_whitespace)
    broken = SampleToken("".join(words[:-1]), None, None)
    tokens[target] = broken
    soup = draw(draw_token_soup(tokens))
    return (broken, soup)


# Test that we can lex tokens correctly
@given(draw_soup_valid())
def test_lexer_valid(soup):
    try:
        lexed = parse.Tokenizer(soup.code, soup.filename).tokenize()
    except parse.ParseError as e:
        raise AssertionError(f"ParseError thrown: {e}")
    assert len(lexed) <= len(soup.tokens)
    # Samples without a type stand for code the lexer should ignore, so
    # only typed samples are matched against a lexed token
    matched = 0
    for sample in soup.tokens:
        if sample.type:
            assert lexed[matched] == sample
            matched += 1
    # Every lexed token must have been accounted for
    assert matched == len(lexed)


# Test that we can catch a stray EndText
# The lexer must raise a ParseError located at the stray token
@given(draw_error_endtext_stray())
def test_lexer_error_endtext_stray(error_soup):
    (error_token, soup) = error_soup
    try:
        parse.Tokenizer(soup.code, soup.filename).tokenize()
    except parse.ParseError as e:
        context = e.context
        assert context.parent is None
        assert context.context == "reading word"
        assert context.location == error_token.location
        assert e.error == "Found stray EndText"
    else:
        # Lexing finished without raising, so the stray EndText was missed
        raise AssertionError("Lexer accepted a stray EndText")


# Test that removing an EndText from the last available text
# pair causes some error, either from the text data being read as
# code or from the text being unterminated
@given(draw_error_endtext_remove())
def test_lexer_error_endtext_remove(error_soup):
    (error_token, soup) = error_soup
    try:
        parse.Tokenizer(soup.code, soup.filename).tokenize()
    except parse.ParseError as e:
        expected = error_token.location
        assert e.context.parent is None
        # e.context.context will be random
        # The error must be reported at or after the start of the
        # modified text, comparing by line first and then by column
        actual = e.context.location
        assert (actual.line, actual.column) >= (expected.line, expected.column)
        # e.error will be random
    else:
        # Lexing finished without raising, so the missing EndText was missed
        raise AssertionError("Lexer accepted a Text without its EndText")


# General fuzz test: feed arbitrary text through the tokenizer and parser
# and make sure nothing other than a ParseError ever escapes
@given(text(), text())
def test_parser_fuzz(code, filename):
    try:
        lexed = parse.Tokenizer(code, filename).tokenize()
        parse.Parser(lexed).parse_file()
    except parse.ParseError:
        # A ParseError is the controlled way of rejecting bad input
        pass