diff --git a/src/main.py b/src/main.py index d60cf6b..8df3b19 100755 --- a/src/main.py +++ b/src/main.py @@ -11,7 +11,7 @@ sys.path.append(repo_dir) from src import log -from src import parse +from src import oldparse from src import interp @@ -21,7 +21,7 @@ print("Usage: main.py FILENAME") return 1 log.set_default_log_level() - ast = parse.parse_file(args[1]) + ast = oldparse.parse_file(args[1]) if not ast: return 1 interp.run_ast(ast) diff --git a/src/oldparse.py b/src/oldparse.py new file mode 100644 index 0000000..88c010f --- /dev/null +++ b/src/oldparse.py @@ -0,0 +1,426 @@ +# SPDX-License-Identifier: LGPL-2.1-only +# Copyright 2021 Jookia + +from src import log +from src import ast_types + + +class ParseLocation: + def __init__(self, line, column, file): + self.line = line + self.column = column + self.file = file + + def __repr__(self): + return "ParseLocation(line %i, column %i, file '%s')" % ( + self.line, + self.column, + self.file, + ) + + def __eq__(self, other): + return ( + self.line == other.line + and self.column == other.column + and self.file == other.file + ) + + +class ParseContext: + def __init__(self, parent, context, location): + self.parent = parent + self.context = context + self.location = location + + def __repr__(self): + return "ParseContext(parent %s, context '%s', location %s)" % ( + self.parent, + self.context, + self.location, + ) + + +class ParseError(BaseException): + def __init__(self, context, error): + self.context = context + self.error = error + + def __str__(self): + return "ParseError(context %s, error '%s')" % (self.context, self.error) + + +class Token: + def __init__(self, type, value, location): + self.type = type + self.value = value + self.location = location + + def __repr__(self): + return "Token(type %s, value '%s', location %s)" % ( + self.type, + self.value, + self.location, + ) + + +class Word: + def __init__(self, value, position, line, column): + self.value = value + self.position = position + self.line = line + self.column = column + + def __repr__(self): + return "Word(value '%s', position %i, line %i, column %i)" % ( + self.value, + self.position, + self.line, + self.column, + ) + + +def is_whitespace(symbol): + return symbol == " " or symbol == "\t" or symbol == "\n" + + +class Tokenizer: + def __init__(self, input, filename): + self.code = input + self.pos = -1 + self.pos_line = 1 + self.pos_column = 0 + self.filename = filename + + def next(self): + if self.pos == len(self.code) - 1: + log.log(log.LEXER, log.TRACE, "Reached end of file") + return None + else: + self.pos += 1 + symbol = self.code[self.pos] + if symbol == "\n": + self.pos_line += 1 + self.pos_column = 0 + else: + self.pos_column += 1 + pos_string = "line %i column %i" % (self.pos_line, self.pos_column) + symbol_string = symbol + if is_whitespace(symbol): + symbol_string = repr(symbol) + log.log( + log.LEXER, + log.TRACE, + "Read character %s at %s" % (symbol_string, pos_string), + ) + return symbol + + def read_word(self): + value = "" + symbol = self.next() + while is_whitespace(symbol): + log.log(log.LEXER, log.TRACE, "Skipping whitespace") + symbol = self.next() + if not symbol: + log.log(log.LEXER, log.TRACE, "No word to read") + return None + pos = self.pos + line = self.pos_line + column = self.pos_column + while symbol and not is_whitespace(symbol): + value += symbol + symbol = self.next() + word = Word(value, pos, line, column) + log.log(log.LEXER, log.TRACE, "Read %s" % (word)) + return word + + def skip_note(self, line, column): + log.log(log.LEXER, 
log.TRACE, "Skipping words until EndNote") + context = ParseContext( + None, "reading note", ParseLocation(line, column, self.filename) + ) + word = self.read_word() + while word and word.value != "EndNote": + word = self.read_word() + if not word: + raise ParseError(context, "Hit end of file before EndNote") + + def read_text(self, line, column): + log.log(log.LEXER, log.TRACE, "Reading characters until EndText") + context = ParseContext( + None, "reading text", ParseLocation(line, column, self.filename) + ) + start = self.pos + word = self.read_word() + while word and word.value != "EndText": + word = self.read_word() + if not word: + raise ParseError(context, "Hit end of file before EndText") + else: + return self.code[start + 1 : word.position - 1].strip("\n\t ") + + def skip_shebang(self): + log.log(log.LEXER, log.TRACE, "Skipping shebang") + next_line = self.code.find("\n") + 1 + self.code = self.code[next_line:] + self.pos_line = 2 + + def tokenize(self): + if self.code[0:2] == "#!": + self.skip_shebang() + keywords = [ + "NewLang", + "Done", + "Set", + "To", + "EndSet", + "If", + "Then", + "Else", + "EndIf", + ] + tokens = [] + word = self.read_word() + while word: + token = word.value + line = word.line + column = word.column + context = ParseContext( + None, "reading word", ParseLocation(line, column, self.filename) + ) + if token == "StartNote": + self.skip_note(line, column) + word = self.read_word() + continue + elif token == "EndNote": + raise ParseError(context, "Found stray EndNote") + elif token == "StartText": + type = "text" + value = self.read_text(line, column) + elif token == "EndText": + raise ParseError(context, "Found stray EndText") + elif token in ["True", "False"]: + type = "bool" + value = token == "True" + elif token in keywords: + type = "keyword" + value = token + else: + type = "symbol" + value = token + tok = Token(type, value, ParseLocation(line, column, self.filename)) + log.log(log.LEXER, log.DEBUG, "Appending %s" % (tok)) + tokens.append(tok) + word = self.read_word() + log.log(log.LEXER, log.TRACE, "Done tokenizing, adding EOF") + tokens.append( + Token( + "EOF", + None, + ParseLocation(self.pos_line, self.pos_column, self.filename), + ) + ) + log.log(log.LEXER, log.DEBUG, "Tokens are %s" % (tokens)) + return tokens + + +class Parser: + def __init__(self, tokens): + self.tokens = tokens + self.pos = 0 + + def next(self): + token = self.tokens[self.pos] + if self.pos < (len(self.tokens) - 1): + self.pos += 1 + log.log(log.PARSER, log.TRACE, "Read %s" % (token)) + return token + + def peek(self): + token = self.tokens[self.pos] + log.log(log.PARSER, log.TRACE, "Peeked %s" % (token)) + return token + + def eof(self): + return self.tokens[self.pos].type == "EOF" + + def create_context(self, context, text): + token = self.tokens[self.pos] + return ParseContext(context, text, token.location) + + def parse_version(self, context): + log.log(log.PARSER, log.TRACE, "Parsing version identifier...") + context = self.create_context(context, "parsing version identifier") + token = self.next() + if token.type != "keyword" or token.value != "NewLang": + raise ParseError( + context, "Expected 'NewLang' keyword, got '%s'" % (token.value) + ) + token = self.next() + version = token.value + if version != "0": + raise ParseError(context, "Unknown version '%s'" % (version)) + log.log(log.PARSER, log.DEBUG, "Parsed version %s" % (version)) + return version + + def parse_value(self, context, subject, type, value): + log.log(log.PARSER, log.TRACE, "Parsing value...") + if 
type == "symbol": + ret = ast_types.Reference(value) + elif type == "text": + ret = ast_types.Text(value) + elif type == "bool": + ret = ast_types.Bool(value) + else: + raise ParseError(context, "Unexpected value type %s" % (type)) + log.log(log.PARSER, log.TRACE, "Parsed value, AST is %s" % (ret)) + return ret + + def parse_arguments(self, meta_context, terminator): + log.log(log.PARSER, log.TRACE, "Parsing arguments until '%s'..." % (terminator)) + context = self.create_context(meta_context, "parsing statement arguments") + args = [] + arg_num = 1 + while True: + log.log(log.PARSER, log.TRACE, "Parsing argument %i..." % (arg_num)) + arg_context = self.create_context( + context, "parsing argument %i" % (arg_num) + ) + end_context = self.create_context(context, "parsing terminator") + token = self.next() + arg_num += 1 + if token.type == "keyword": + if token.value == terminator: + log.log( + log.PARSER, log.TRACE, "Parsed arguments, AST is %s" % (args) + ) + return args + else: + raise ParseError( + end_context, "Expected %s, got %s" % (terminator, token.value) + ) + else: + arg = self.parse_value(arg_context, "argument", token.type, token.value) + log.log(log.PARSER, log.TRACE, "Parsed argument %s" % (arg)) + args.append(arg) + + def parse_statement(self, context, terminator, type): + log.log( + log.PARSER, + log.TRACE, + "Parsing %s statement until '%s'..." % (type, terminator), + ) + meta_context = self.create_context(context, "parsing %s statement" % (type)) + log.log(log.PARSER, log.TRACE, "Parsing statement subject...") + context = self.create_context(meta_context, "parsing subject") + token = self.next() + subject = self.parse_value(context, "subject", token.type, token.value) + log.log(log.PARSER, log.TRACE, "Parsing statement verb...") + context = self.create_context(meta_context, "parsing statement verb") + end_context = self.create_context(context, "parsing terminator") + token = self.next() + if token.type == "keyword": + if token.value == terminator: + verb = None + else: + raise ParseError( + end_context, "Expected %s, got %s" % (terminator, token.value) + ) + elif token.type == "symbol": + verb = token.value + else: + raise ParseError(context, "Expected symbol, got %s" % (token.type)) + log.log(log.PARSER, log.TRACE, "Parsing statement arguments...") + if verb: + arguments = self.parse_arguments(meta_context, terminator) + else: + arguments = [] + statement = ast_types.Statement(subject, verb, arguments) + log.log(log.PARSER, log.DEBUG, "Parsed statement, AST is %s" % (statement)) + return statement + + def parse_set(self, context): + log.log(log.PARSER, log.TRACE, "Parsing set directive...") + meta_context = self.create_context(context, "parsing set directive") + self.next() # Skip 'Set' + log.log(log.PARSER, log.TRACE, "Parsing set subject...") + context = self.create_context(meta_context, "parsing subject") + token = self.next() + if token.type != "symbol": + raise ParseError(context, "Expected symbol, got %s" % (token.type)) + subject = token.value + log.log(log.PARSER, log.TRACE, "Parsing set separator...") + context = self.create_context(meta_context, "parsing set separator") + token = self.next() + if token.type != "keyword" or token.value != "To": + pretty_value = token.value + if token.type != "keyword": + pretty_value = "'%s'" % (pretty_value) + raise ParseError(context, "Expected To, got %s" % (pretty_value)) + log.log(log.PARSER, log.TRACE, "Parsing set value...") + ast = self.parse_statement(meta_context, "EndSet", "set value") + set = ast_types.Set(subject, 
ast) + log.log(log.PARSER, log.DEBUG, "Parsed set, AST is %s" % (set)) + return set + + def parse_if(self, context): + log.log(log.PARSER, log.TRACE, "Parsing if directive...") + context = self.create_context(context, "parsing if directive") + self.next() # Skip 'If' + test = self.parse_statement(context, "Then", "test condition") + log.log(log.PARSER, log.TRACE, "Parsing if success statement...") + success = self.parse_statement(context, "Else", "success") + log.log(log.PARSER, log.TRACE, "Parsing if failure statement...") + failure = self.parse_statement(context, "EndIf", "failure") + conditional = ast_types.Conditional(test, success, failure) + log.log(log.PARSER, log.DEBUG, "Parsed if, AST is %s" % (conditional)) + return conditional + + def parse_directive(self, context): + token = self.peek() + if token.type != "keyword" and token.type != "symbol" and token.type != "bool": + raise ParseError( + context, "Expected keyword, symbol or bool, got %s" % (token.type) + ) + if token.type == "keyword": + if token.value == "Set": + return self.parse_set(context) + elif token.value == "If": + return self.parse_if(context) + else: + raise ParseError(context, "Unexpected keyword %s" % (token.value)) + else: + ast = self.parse_statement(context, "Done", "command") + return ast + + def parse_file(self): + log.log(log.PARSER, log.TRACE, "Parsing file...") + ast = [] + self.parse_version(None) + while not self.eof(): + log.log(log.PARSER, log.TRACE, "Parsing next directive in file...") + ast.append(self.parse_directive(None)) + log.log(log.PARSER, log.DEBUG, "Parsed file, AST is %s" % (ast)) + return ast + + +def parse_file(filename): + try: + code = open(filename, encoding="utf-8").read() + except UnicodeDecodeError: + print("Parse error: %s is not valid UTF-8" % (filename)) + return None + try: + tokenizer = Tokenizer(code, filename) + tokens = tokenizer.tokenize() + parser = Parser(tokens) + return parser.parse_file() + except ParseError as e: + print("Parse error: %s" % (e.error)) + context = e.context + while context: + line = context.location.line + column = context.location.column + print("While %s at line %i column %i" % (context.context, line, column)) + context = context.parent + print("While parsing file %s" % (filename)) + return None diff --git a/src/parse.py b/src/parse.py deleted file mode 100644 index 88c010f..0000000 --- a/src/parse.py +++ /dev/null @@ -1,426 +0,0 @@ -# SPDX-License-Identifier: LGPL-2.1-only -# Copyright 2021 Jookia - -from src import log -from src import ast_types - - -class ParseLocation: - def __init__(self, line, column, file): - self.line = line - self.column = column - self.file = file - - def __repr__(self): - return "ParseLocation(line %i, column %i, file '%s')" % ( - self.line, - self.column, - self.file, - ) - - def __eq__(self, other): - return ( - self.line == other.line - and self.column == other.column - and self.file == other.file - ) - - -class ParseContext: - def __init__(self, parent, context, location): - self.parent = parent - self.context = context - self.location = location - - def __repr__(self): - return "ParseContext(parent %s, context '%s', location %s)" % ( - self.parent, - self.context, - self.location, - ) - - -class ParseError(BaseException): - def __init__(self, context, error): - self.context = context - self.error = error - - def __str__(self): - return "ParseError(context %s, error '%s')" % (self.context, self.error) - - -class Token: - def __init__(self, type, value, location): - self.type = type - self.value = value - self.location 
= location - - def __repr__(self): - return "Token(type %s, value '%s', location %s)" % ( - self.type, - self.value, - self.location, - ) - - -class Word: - def __init__(self, value, position, line, column): - self.value = value - self.position = position - self.line = line - self.column = column - - def __repr__(self): - return "Word(value '%s', position %i, line %i, column %i)" % ( - self.value, - self.position, - self.line, - self.column, - ) - - -def is_whitespace(symbol): - return symbol == " " or symbol == "\t" or symbol == "\n" - - -class Tokenizer: - def __init__(self, input, filename): - self.code = input - self.pos = -1 - self.pos_line = 1 - self.pos_column = 0 - self.filename = filename - - def next(self): - if self.pos == len(self.code) - 1: - log.log(log.LEXER, log.TRACE, "Reached end of file") - return None - else: - self.pos += 1 - symbol = self.code[self.pos] - if symbol == "\n": - self.pos_line += 1 - self.pos_column = 0 - else: - self.pos_column += 1 - pos_string = "line %i column %i" % (self.pos_line, self.pos_column) - symbol_string = symbol - if is_whitespace(symbol): - symbol_string = repr(symbol) - log.log( - log.LEXER, - log.TRACE, - "Read character %s at %s" % (symbol_string, pos_string), - ) - return symbol - - def read_word(self): - value = "" - symbol = self.next() - while is_whitespace(symbol): - log.log(log.LEXER, log.TRACE, "Skipping whitespace") - symbol = self.next() - if not symbol: - log.log(log.LEXER, log.TRACE, "No word to read") - return None - pos = self.pos - line = self.pos_line - column = self.pos_column - while symbol and not is_whitespace(symbol): - value += symbol - symbol = self.next() - word = Word(value, pos, line, column) - log.log(log.LEXER, log.TRACE, "Read %s" % (word)) - return word - - def skip_note(self, line, column): - log.log(log.LEXER, log.TRACE, "Skipping words until EndNote") - context = ParseContext( - None, "reading note", ParseLocation(line, column, self.filename) - ) - word = self.read_word() - while word and word.value != "EndNote": - word = self.read_word() - if not word: - raise ParseError(context, "Hit end of file before EndNote") - - def read_text(self, line, column): - log.log(log.LEXER, log.TRACE, "Reading characters until EndText") - context = ParseContext( - None, "reading text", ParseLocation(line, column, self.filename) - ) - start = self.pos - word = self.read_word() - while word and word.value != "EndText": - word = self.read_word() - if not word: - raise ParseError(context, "Hit end of file before EndText") - else: - return self.code[start + 1 : word.position - 1].strip("\n\t ") - - def skip_shebang(self): - log.log(log.LEXER, log.TRACE, "Skipping shebang") - next_line = self.code.find("\n") + 1 - self.code = self.code[next_line:] - self.pos_line = 2 - - def tokenize(self): - if self.code[0:2] == "#!": - self.skip_shebang() - keywords = [ - "NewLang", - "Done", - "Set", - "To", - "EndSet", - "If", - "Then", - "Else", - "EndIf", - ] - tokens = [] - word = self.read_word() - while word: - token = word.value - line = word.line - column = word.column - context = ParseContext( - None, "reading word", ParseLocation(line, column, self.filename) - ) - if token == "StartNote": - self.skip_note(line, column) - word = self.read_word() - continue - elif token == "EndNote": - raise ParseError(context, "Found stray EndNote") - elif token == "StartText": - type = "text" - value = self.read_text(line, column) - elif token == "EndText": - raise ParseError(context, "Found stray EndText") - elif token in ["True", "False"]: - type 
= "bool" - value = token == "True" - elif token in keywords: - type = "keyword" - value = token - else: - type = "symbol" - value = token - tok = Token(type, value, ParseLocation(line, column, self.filename)) - log.log(log.LEXER, log.DEBUG, "Appending %s" % (tok)) - tokens.append(tok) - word = self.read_word() - log.log(log.LEXER, log.TRACE, "Done tokenizing, adding EOF") - tokens.append( - Token( - "EOF", - None, - ParseLocation(self.pos_line, self.pos_column, self.filename), - ) - ) - log.log(log.LEXER, log.DEBUG, "Tokens are %s" % (tokens)) - return tokens - - -class Parser: - def __init__(self, tokens): - self.tokens = tokens - self.pos = 0 - - def next(self): - token = self.tokens[self.pos] - if self.pos < (len(self.tokens) - 1): - self.pos += 1 - log.log(log.PARSER, log.TRACE, "Read %s" % (token)) - return token - - def peek(self): - token = self.tokens[self.pos] - log.log(log.PARSER, log.TRACE, "Peeked %s" % (token)) - return token - - def eof(self): - return self.tokens[self.pos].type == "EOF" - - def create_context(self, context, text): - token = self.tokens[self.pos] - return ParseContext(context, text, token.location) - - def parse_version(self, context): - log.log(log.PARSER, log.TRACE, "Parsing version identifier...") - context = self.create_context(context, "parsing version identifier") - token = self.next() - if token.type != "keyword" or token.value != "NewLang": - raise ParseError( - context, "Expected 'NewLang' keyword, got '%s'" % (token.value) - ) - token = self.next() - version = token.value - if version != "0": - raise ParseError(context, "Unknown version '%s'" % (version)) - log.log(log.PARSER, log.DEBUG, "Parsed version %s" % (version)) - return version - - def parse_value(self, context, subject, type, value): - log.log(log.PARSER, log.TRACE, "Parsing value...") - if type == "symbol": - ret = ast_types.Reference(value) - elif type == "text": - ret = ast_types.Text(value) - elif type == "bool": - ret = ast_types.Bool(value) - else: - raise ParseError(context, "Unexpected value type %s" % (type)) - log.log(log.PARSER, log.TRACE, "Parsed value, AST is %s" % (ret)) - return ret - - def parse_arguments(self, meta_context, terminator): - log.log(log.PARSER, log.TRACE, "Parsing arguments until '%s'..." % (terminator)) - context = self.create_context(meta_context, "parsing statement arguments") - args = [] - arg_num = 1 - while True: - log.log(log.PARSER, log.TRACE, "Parsing argument %i..." % (arg_num)) - arg_context = self.create_context( - context, "parsing argument %i" % (arg_num) - ) - end_context = self.create_context(context, "parsing terminator") - token = self.next() - arg_num += 1 - if token.type == "keyword": - if token.value == terminator: - log.log( - log.PARSER, log.TRACE, "Parsed arguments, AST is %s" % (args) - ) - return args - else: - raise ParseError( - end_context, "Expected %s, got %s" % (terminator, token.value) - ) - else: - arg = self.parse_value(arg_context, "argument", token.type, token.value) - log.log(log.PARSER, log.TRACE, "Parsed argument %s" % (arg)) - args.append(arg) - - def parse_statement(self, context, terminator, type): - log.log( - log.PARSER, - log.TRACE, - "Parsing %s statement until '%s'..." 
% (type, terminator), - ) - meta_context = self.create_context(context, "parsing %s statement" % (type)) - log.log(log.PARSER, log.TRACE, "Parsing statement subject...") - context = self.create_context(meta_context, "parsing subject") - token = self.next() - subject = self.parse_value(context, "subject", token.type, token.value) - log.log(log.PARSER, log.TRACE, "Parsing statement verb...") - context = self.create_context(meta_context, "parsing statement verb") - end_context = self.create_context(context, "parsing terminator") - token = self.next() - if token.type == "keyword": - if token.value == terminator: - verb = None - else: - raise ParseError( - end_context, "Expected %s, got %s" % (terminator, token.value) - ) - elif token.type == "symbol": - verb = token.value - else: - raise ParseError(context, "Expected symbol, got %s" % (token.type)) - log.log(log.PARSER, log.TRACE, "Parsing statement arguments...") - if verb: - arguments = self.parse_arguments(meta_context, terminator) - else: - arguments = [] - statement = ast_types.Statement(subject, verb, arguments) - log.log(log.PARSER, log.DEBUG, "Parsed statement, AST is %s" % (statement)) - return statement - - def parse_set(self, context): - log.log(log.PARSER, log.TRACE, "Parsing set directive...") - meta_context = self.create_context(context, "parsing set directive") - self.next() # Skip 'Set' - log.log(log.PARSER, log.TRACE, "Parsing set subject...") - context = self.create_context(meta_context, "parsing subject") - token = self.next() - if token.type != "symbol": - raise ParseError(context, "Expected symbol, got %s" % (token.type)) - subject = token.value - log.log(log.PARSER, log.TRACE, "Parsing set separator...") - context = self.create_context(meta_context, "parsing set separator") - token = self.next() - if token.type != "keyword" or token.value != "To": - pretty_value = token.value - if token.type != "keyword": - pretty_value = "'%s'" % (pretty_value) - raise ParseError(context, "Expected To, got %s" % (pretty_value)) - log.log(log.PARSER, log.TRACE, "Parsing set value...") - ast = self.parse_statement(meta_context, "EndSet", "set value") - set = ast_types.Set(subject, ast) - log.log(log.PARSER, log.DEBUG, "Parsed set, AST is %s" % (set)) - return set - - def parse_if(self, context): - log.log(log.PARSER, log.TRACE, "Parsing if directive...") - context = self.create_context(context, "parsing if directive") - self.next() # Skip 'If' - test = self.parse_statement(context, "Then", "test condition") - log.log(log.PARSER, log.TRACE, "Parsing if success statement...") - success = self.parse_statement(context, "Else", "success") - log.log(log.PARSER, log.TRACE, "Parsing if failure statement...") - failure = self.parse_statement(context, "EndIf", "failure") - conditional = ast_types.Conditional(test, success, failure) - log.log(log.PARSER, log.DEBUG, "Parsed if, AST is %s" % (conditional)) - return conditional - - def parse_directive(self, context): - token = self.peek() - if token.type != "keyword" and token.type != "symbol" and token.type != "bool": - raise ParseError( - context, "Expected keyword, symbol or bool, got %s" % (token.type) - ) - if token.type == "keyword": - if token.value == "Set": - return self.parse_set(context) - elif token.value == "If": - return self.parse_if(context) - else: - raise ParseError(context, "Unexpected keyword %s" % (token.value)) - else: - ast = self.parse_statement(context, "Done", "command") - return ast - - def parse_file(self): - log.log(log.PARSER, log.TRACE, "Parsing file...") - ast = [] - 
self.parse_version(None) - while not self.eof(): - log.log(log.PARSER, log.TRACE, "Parsing next directive in file...") - ast.append(self.parse_directive(None)) - log.log(log.PARSER, log.DEBUG, "Parsed file, AST is %s" % (ast)) - return ast - - -def parse_file(filename): - try: - code = open(filename, encoding="utf-8").read() - except UnicodeDecodeError: - print("Parse error: %s is not valid UTF-8" % (filename)) - return None - try: - tokenizer = Tokenizer(code, filename) - tokens = tokenizer.tokenize() - parser = Parser(tokens) - return parser.parse_file() - except ParseError as e: - print("Parse error: %s" % (e.error)) - context = e.context - while context: - line = context.location.line - column = context.location.column - print("While %s at line %i column %i" % (context.context, line, column)) - context = context.parent - print("While parsing file %s" % (filename)) - return None diff --git a/tests/test_oldparse.py b/tests/test_oldparse.py new file mode 100644 index 0000000..94b2226 --- /dev/null +++ b/tests/test_oldparse.py @@ -0,0 +1,281 @@ +# SPDX-License-Identifier: LGPL-2.1-only +# Copyright 2022 Jookia + +from hypothesis import given, assume +from hypothesis.strategies import ( + text, + booleans, + sampled_from, + one_of, + characters, + lists, + composite, + randoms, +) + +from src import oldparse + +# Whitespace that separates lexer words +lexer_whitespace = "\n\t " + +# List of keywords the lexer understands +keywords = [ + "NewLang", + "Done", + "Set", + "To", + "EndSet", + "If", + "Then", + "Else", + "EndIf", +] + + +# List of words the lexer understands +reserved_words = keywords + [ + "StartText", + "EndText", + "StartNote", + "EndNote", + "True", + "False", +] + + +# A sample token containing code to create a lexer token, and +# the resulting lexer type and value +# An type and value of 'None' is used for lexer code that +# should get ignored, such as shebangs and notes +class SampleToken: + def __init__(self, code, type, value): + self.code = code + self.type = type + self.value = value + self.location = None + + def __repr__(self): + return "SampleToken(code %s, type '%s', value %s, location %s)" % ( + repr(self.code), + self.type, + repr(self.value), + self.location, + ) + + def __eq__(self, other): + return ( + # Don't check code + self.value == other.value + and self.type == other.type + and self.location.file == other.location.file + and self.location.line == other.location.line + and self.location.column == other.location.column + ) + + +# A soup of sample tokens +class SampleSoup: + def __init__(self, tokens, code, filename): + self.tokens = tokens + self.code = code + self.filename = filename + + def __repr__(self): + return "SampleSoup(tokens %s, code %s, filename '%s')" % ( + self.tokens, + repr(self.code), + self.filename, + ) + + +# Draws a textual identifier consisting of random characters and reserved words +@composite +def draw_identifier(draw): + identifiers = draw( + lists( + text(alphabet=characters(blacklist_characters=lexer_whitespace), min_size=1) + ) + ) + # If we have no identifiers, draw at least two words so we don't accidentally + # draw a reserved word alone. 
+ min_words = 2 if len(identifiers) == 0 else 0 + words = draw(lists(sampled_from(reserved_words), min_size=min_words)) + all_words = identifiers + words + draw(randoms()).shuffle(all_words) + value = "".join(all_words) + assume(value not in reserved_words) # Reserved words aren't symbols + assume(not value.startswith("#!")) # Shebangs aren't symbols + return value + + +# Draws whitespace ignored by the lexer +@composite +def draw_whitespace(draw): + return "".join(draw(lists(sampled_from(lexer_whitespace), min_size=1))) + + +# Draws a list of words separated by whitespace +@composite +def draw_joined_words(draw, words): + output = "" + for word in words[0:1]: + # No space before the first word + output += word + for word in words[1:]: + space = draw(draw_whitespace()) + output += space + word + return output + + +# Draws zero or more words made of identifiers and reserved words +@composite +def draw_word_salad(draw, exclude_words): + reserved = reserved_words.copy() + for exclude in exclude_words: + reserved.remove(exclude) + strategies = [ + draw_identifier(), + sampled_from(reserved), + ] + words = draw(lists(one_of(strategies))) + return draw(draw_joined_words(words)) + + +# Generates a Text token +@composite +def draw_token_text(draw): + value = draw(draw_word_salad(["StartText", "EndText"])) + space1 = draw(draw_whitespace()) + space2 = draw(draw_whitespace()) + code = "StartText" + space1 + value + space2 + "EndText" + return SampleToken(code, "text", value.strip(lexer_whitespace)) + + +# Generates a Bool token +@composite +def draw_token_bool(draw): + bool = draw(booleans()) + if bool is True: + code = "True" + else: + code = "False" + return SampleToken(code, "bool", bool) + + +# Generates a keyword token +@composite +def draw_token_keyword(draw): + keyword = draw(sampled_from(keywords)) + return SampleToken(keyword, "keyword", keyword) + + +# Generates a symbol token +@composite +def draw_token_symbol(draw): + symbol = draw(draw_identifier()) + return SampleToken(symbol, "symbol", symbol) + + +# Generates a note token +@composite +def draw_token_note(draw): + value = draw(draw_word_salad(["StartNote", "EndNote"])) + space1 = draw(draw_whitespace()) + space2 = draw(draw_whitespace()) + code = "StartNote" + space1 + value + space2 + "EndNote" + return SampleToken(code, None, None) + + +# Generates a shebang token +@composite +def draw_token_shebang(draw): + shebang = draw(draw_word_salad([])).replace("\n", "") + code = "#!" 
+ shebang + "\n" + return SampleToken(code, None, None) + + +# Generates an empty token +@composite +def draw_token_empty(draw): + return SampleToken("", None, None) + + +# Generates a set of valid tokens +@composite +def draw_tokens_valid(draw): + strategies = [ + draw_token_text(), + draw_token_bool(), + draw_token_keyword(), + draw_token_symbol(), + draw_token_note(), + draw_token_empty(), + ] + shebang = draw(lists(draw_token_shebang(), max_size=1)) + tokens = draw(lists(one_of(strategies))) + all_tokens = shebang + tokens + return all_tokens + + +# Generates a soup of tokens using a given strategy +@composite +def draw_token_soup(draw, all_tokens): + filename = draw(text()) + code = "" + curr_line = 1 + curr_column = 1 + for token in all_tokens: + space = draw(draw_whitespace()) + new_code = token.code + space + lines = new_code.split("\n") + code += new_code + token.location = oldparse.ParseLocation(curr_line, curr_column, filename) + curr_line += len(lines) - 1 + if len(lines) > 1: + curr_column = len(lines[-1]) + 1 + else: + curr_column += len(new_code) + eof = SampleToken(None, "EOF", None) + eof.location = oldparse.ParseLocation(curr_line, curr_column - 1, filename) + return SampleSoup(all_tokens + [eof], code, filename) + + +# Generates a soup of valid tokens +@composite +def draw_soup_valid(draw): + tokens = draw(draw_tokens_valid()) + soup = draw(draw_token_soup(tokens)) + return soup + + +# Test that we can lex tokens correctly +@given(draw_soup_valid()) +def test_lexer_valid(soup): + try: + tokenizer = oldparse.Tokenizer(soup.code, soup.filename) + tokens = tokenizer.tokenize() + except oldparse.ParseError as e: + raise AssertionError("ParseError thrown: %s" % (e)) + assert len(tokens) <= len(soup.tokens) + in_pos = 0 + out_pos = 0 + while in_pos < len(soup.tokens): + if soup.tokens[in_pos].type: + assert tokens[out_pos] == soup.tokens[in_pos] + out_pos += 1 + in_pos += 1 + assert in_pos == len(soup.tokens) + assert out_pos == len(tokens) + + +# General fuzz test, make sure the parser doesn't fall apart and spew +# uncontrolled errors. +@given(text(), text()) +def test_oldparser_fuzz(code, filename): + try: + tokenizer = oldparse.Tokenizer(code, filename) + tokens = tokenizer.tokenize() + parser = oldparse.Parser(tokens) + parser.parse_file() + except oldparse.ParseError: + pass diff --git a/tests/test_oldparse_regress.py b/tests/test_oldparse_regress.py new file mode 100644 index 0000000..298a953 --- /dev/null +++ b/tests/test_oldparse_regress.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: LGPL-2.1-only +# Copyright 2022 Jookia + +from src import oldparse + + +# The parser had some logic along the lines of 'read token until whitespace', +# but this didn't account for hitting the end of file. +# Make sure the parser can handle tokens terminated by end of file correctly. +def test_regress_eof(): + tokenizer = oldparse.Tokenizer("Hello", "") + tokens = tokenizer.tokenize() + assert tokens[0].value == "Hello" + + +# The parser would read text literals by tracking the position just after of +# the StartText and EndText tokens, then reading the literal text between them. +# It would automatically remove EndText as well as the character after it. +# However, if EndText was the last token, this would cause the text to cut off. +# Make sure the parser can handle reading text at the end of a file. +def test_regress_text_eof(): + text = "Hi there!" 
+ code = "StartText " + text + " EndText" + tokenizer1 = oldparse.Tokenizer(code, "") + tokens1 = tokenizer1.tokenize() + tokenizer2 = oldparse.Tokenizer(code + " ", "") + tokens2 = tokenizer2.tokenize() + assert tokens1[0].type == "text" + assert tokens1[0].value == text + assert tokens2[0].type == "text" + assert tokens2[0].value == text diff --git a/tests/test_parse.py b/tests/test_parse.py deleted file mode 100644 index fdbd4af..0000000 --- a/tests/test_parse.py +++ /dev/null @@ -1,281 +0,0 @@ -# SPDX-License-Identifier: LGPL-2.1-only -# Copyright 2022 Jookia - -from hypothesis import given, assume -from hypothesis.strategies import ( - text, - booleans, - sampled_from, - one_of, - characters, - lists, - composite, - randoms, -) - -from src import parse - -# Whitespace that separates lexer words -lexer_whitespace = "\n\t " - -# List of keywords the lexer understands -keywords = [ - "NewLang", - "Done", - "Set", - "To", - "EndSet", - "If", - "Then", - "Else", - "EndIf", -] - - -# List of words the lexer understands -reserved_words = keywords + [ - "StartText", - "EndText", - "StartNote", - "EndNote", - "True", - "False", -] - - -# A sample token containing code to create a lexer token, and -# the resulting lexer type and value -# An type and value of 'None' is used for lexer code that -# should get ignored, such as shebangs and notes -class SampleToken: - def __init__(self, code, type, value): - self.code = code - self.type = type - self.value = value - self.location = None - - def __repr__(self): - return "SampleToken(code %s, type '%s', value %s, location %s)" % ( - repr(self.code), - self.type, - repr(self.value), - self.location, - ) - - def __eq__(self, other): - return ( - # Don't check code - self.value == other.value - and self.type == other.type - and self.location.file == other.location.file - and self.location.line == other.location.line - and self.location.column == other.location.column - ) - - -# A soup of sample tokens -class SampleSoup: - def __init__(self, tokens, code, filename): - self.tokens = tokens - self.code = code - self.filename = filename - - def __repr__(self): - return "SampleSoup(tokens %s, code %s, filename '%s')" % ( - self.tokens, - repr(self.code), - self.filename, - ) - - -# Draws a textual identifier consisting of random characters and reserved words -@composite -def draw_identifier(draw): - identifiers = draw( - lists( - text(alphabet=characters(blacklist_characters=lexer_whitespace), min_size=1) - ) - ) - # If we have no identifiers, draw at least two words so we don't accidentally - # draw a reserved word alone. 
- min_words = 2 if len(identifiers) == 0 else 0 - words = draw(lists(sampled_from(reserved_words), min_size=min_words)) - all_words = identifiers + words - draw(randoms()).shuffle(all_words) - value = "".join(all_words) - assume(value not in reserved_words) # Reserved words aren't symbols - assume(not value.startswith("#!")) # Shebangs aren't symbols - return value - - -# Draws whitespace ignored by the lexer -@composite -def draw_whitespace(draw): - return "".join(draw(lists(sampled_from(lexer_whitespace), min_size=1))) - - -# Draws a list of words separated by whitespace -@composite -def draw_joined_words(draw, words): - output = "" - for word in words[0:1]: - # No space before the first word - output += word - for word in words[1:]: - space = draw(draw_whitespace()) - output += space + word - return output - - -# Draws zero or more words made of identifiers and reserved words -@composite -def draw_word_salad(draw, exclude_words): - reserved = reserved_words.copy() - for exclude in exclude_words: - reserved.remove(exclude) - strategies = [ - draw_identifier(), - sampled_from(reserved), - ] - words = draw(lists(one_of(strategies))) - return draw(draw_joined_words(words)) - - -# Generates a Text token -@composite -def draw_token_text(draw): - value = draw(draw_word_salad(["StartText", "EndText"])) - space1 = draw(draw_whitespace()) - space2 = draw(draw_whitespace()) - code = "StartText" + space1 + value + space2 + "EndText" - return SampleToken(code, "text", value.strip(lexer_whitespace)) - - -# Generates a Bool token -@composite -def draw_token_bool(draw): - bool = draw(booleans()) - if bool is True: - code = "True" - else: - code = "False" - return SampleToken(code, "bool", bool) - - -# Generates a keyword token -@composite -def draw_token_keyword(draw): - keyword = draw(sampled_from(keywords)) - return SampleToken(keyword, "keyword", keyword) - - -# Generates a symbol token -@composite -def draw_token_symbol(draw): - symbol = draw(draw_identifier()) - return SampleToken(symbol, "symbol", symbol) - - -# Generates a note token -@composite -def draw_token_note(draw): - value = draw(draw_word_salad(["StartNote", "EndNote"])) - space1 = draw(draw_whitespace()) - space2 = draw(draw_whitespace()) - code = "StartNote" + space1 + value + space2 + "EndNote" - return SampleToken(code, None, None) - - -# Generates a shebang token -@composite -def draw_token_shebang(draw): - shebang = draw(draw_word_salad([])).replace("\n", "") - code = "#!" 
+ shebang + "\n" - return SampleToken(code, None, None) - - -# Generates an empty token -@composite -def draw_token_empty(draw): - return SampleToken("", None, None) - - -# Generates a set of valid tokens -@composite -def draw_tokens_valid(draw): - strategies = [ - draw_token_text(), - draw_token_bool(), - draw_token_keyword(), - draw_token_symbol(), - draw_token_note(), - draw_token_empty(), - ] - shebang = draw(lists(draw_token_shebang(), max_size=1)) - tokens = draw(lists(one_of(strategies))) - all_tokens = shebang + tokens - return all_tokens - - -# Generates a soup of tokens using a given strategy -@composite -def draw_token_soup(draw, all_tokens): - filename = draw(text()) - code = "" - curr_line = 1 - curr_column = 1 - for token in all_tokens: - space = draw(draw_whitespace()) - new_code = token.code + space - lines = new_code.split("\n") - code += new_code - token.location = parse.ParseLocation(curr_line, curr_column, filename) - curr_line += len(lines) - 1 - if len(lines) > 1: - curr_column = len(lines[-1]) + 1 - else: - curr_column += len(new_code) - eof = SampleToken(None, "EOF", None) - eof.location = parse.ParseLocation(curr_line, curr_column - 1, filename) - return SampleSoup(all_tokens + [eof], code, filename) - - -# Generates a soup of valid tokens -@composite -def draw_soup_valid(draw): - tokens = draw(draw_tokens_valid()) - soup = draw(draw_token_soup(tokens)) - return soup - - -# Test that we can lex tokens correctly -@given(draw_soup_valid()) -def test_lexer_valid(soup): - try: - tokenizer = parse.Tokenizer(soup.code, soup.filename) - tokens = tokenizer.tokenize() - except parse.ParseError as e: - raise AssertionError("ParseError thrown: %s" % (e)) - assert len(tokens) <= len(soup.tokens) - in_pos = 0 - out_pos = 0 - while in_pos < len(soup.tokens): - if soup.tokens[in_pos].type: - assert tokens[out_pos] == soup.tokens[in_pos] - out_pos += 1 - in_pos += 1 - assert in_pos == len(soup.tokens) - assert out_pos == len(tokens) - - -# General fuzz test, make sure the parser doesn't fall apart and spew -# uncontrolled errors. -@given(text(), text()) -def test_parser_fuzz(code, filename): - try: - tokenizer = parse.Tokenizer(code, filename) - tokens = tokenizer.tokenize() - parser = parse.Parser(tokens) - parser.parse_file() - except parse.ParseError: - pass diff --git a/tests/test_parse_regress.py b/tests/test_parse_regress.py deleted file mode 100644 index 650907f..0000000 --- a/tests/test_parse_regress.py +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: LGPL-2.1-only -# Copyright 2022 Jookia - -from src import parse - - -# The parser had some logic along the lines of 'read token until whitespace', -# but this didn't account for hitting the end of file. -# Make sure the parser can handle tokens terminated by end of file correctly. -def test_regress_eof(): - tokenizer = parse.Tokenizer("Hello", "") - tokens = tokenizer.tokenize() - assert tokens[0].value == "Hello" - - -# The parser would read text literals by tracking the position just after of -# the StartText and EndText tokens, then reading the literal text between them. -# It would automatically remove EndText as well as the character after it. -# However, if EndText was the last token, this would cause the text to cut off. -# Make sure the parser can handle reading text at the end of a file. -def test_regress_text_eof(): - text = "Hi there!" 
- code = "StartText " + text + " EndText" - tokenizer1 = parse.Tokenizer(code, "") - tokens1 = tokenizer1.tokenize() - tokenizer2 = parse.Tokenizer(code + " ", "") - tokens2 = tokenizer2.tokenize() - assert tokens1[0].type == "text" - assert tokens1[0].value == text - assert tokens2[0].type == "text" - assert tokens2[0].value == text