# NewLang source file: src/parse.py
# SPDX-License-Identifier: LGPL-2.1-or-later
# Copyright 2021 Jookia <contact@jookia.org>

from src import log
from src import ast_types


class ParseLocation:
    """A position (line and column) within a named source file."""

    def __init__(self, line, column, file):
        self.line = line
        self.column = column
        self.file = file

    def __repr__(self):
        return (
            f"ParseLocation(line {self.line}, "
            f"column {self.column}, file '{self.file}')"
        )

    def __eq__(self, other):
        # Compare all three fields at once as tuples.
        ours = (self.line, self.column, self.file)
        theirs = (other.line, other.column, other.file)
        return ours == theirs


class ParseContext:
    """One step in a chain of parse activities, linked via *parent*.

    Used to report what the parser was doing (and where) when an error
    occurred.
    """

    def __init__(self, parent, context, location):
        self.parent = parent
        self.context = context
        self.location = location

    def __repr__(self):
        return (
            f"ParseContext(parent {self.parent}, "
            f"context '{self.context}', location {self.location})"
        )


class ParseError(Exception):
    """Raised when tokenizing or parsing NewLang source fails.

    Carries the ParseContext chain describing what was being parsed,
    plus a human-readable error message.

    Derives from Exception rather than BaseException: BaseException is
    reserved for interpreter-exit signals (SystemExit, KeyboardInterrupt),
    and an application error deriving from it would slip past ordinary
    `except Exception` boundaries.
    """

    def __init__(self, context, error):
        self.context = context
        self.error = error

    def __str__(self):
        return "ParseError(context %s, error '%s')" % (self.context, self.error)


class Token:
    """A lexed token: its type, its value and its source location."""

    def __init__(self, type, value, location):
        self.type = type
        self.value = value
        self.location = location

    def __repr__(self):
        return (
            f"Token(type {self.type}, "
            f"value '{self.value}', location {self.location})"
        )


class Word:
    """A raw whitespace-delimited word and the position it was read from."""

    def __init__(self, value, position, line, column):
        self.value = value
        self.position = position
        self.line = line
        self.column = column

    def __repr__(self):
        return (
            f"Word(value '{self.value}', position {self.position}, "
            f"line {self.line}, column {self.column})"
        )


def is_whitespace(symbol):
    """Return True if *symbol* is a space, tab or newline.

    Any other value — including None, which the tokenizer passes at end
    of input — yields False.
    """
    return symbol in (" ", "\t", "\n")


class Tokenizer:
    """Turns NewLang source text into a flat list of Token objects.

    Scans the input one character at a time while tracking the current
    offset, line and column, so every emitted token carries an accurate
    ParseLocation.
    """

    def __init__(self, input, filename):
        # Full source text being scanned (may be truncated by skip_shebang).
        self.code = input
        # Offset of the most recently read character; -1 = nothing read yet.
        self.pos = -1
        # 1-based line number of the most recently read character.
        self.pos_line = 1
        # Column of the most recently read character; 0 right after a newline.
        self.pos_column = 0
        self.filename = filename

    def next(self):
        """Consume and return the next character, or None at end of input."""
        if self.pos == len(self.code) - 1:
            log.log(log.LEXER, log.TRACE, "Reached end of file")
            return None
        else:
            self.pos += 1
            symbol = self.code[self.pos]
            if symbol == "\n":
                self.pos_line += 1
                self.pos_column = 0
            else:
                self.pos_column += 1
            pos_string = "line %i column %i" % (self.pos_line, self.pos_column)
            symbol_string = symbol
            if is_whitespace(symbol):
                # Log whitespace as an escaped literal so it stays visible.
                symbol_string = repr(symbol)
            log.log(
                log.LEXER,
                log.TRACE,
                "Read character %s at %s" % (symbol_string, pos_string),
            )
            return symbol

    def read_word(self):
        """Skip leading whitespace, then read one whitespace-delimited word.

        Returns a Word recording the text and where it started, or None
        if only whitespace remained before end of input.
        """
        value = ""
        symbol = self.next()
        while is_whitespace(symbol):
            log.log(log.LEXER, log.TRACE, "Skipping whitespace")
            symbol = self.next()
        if not symbol:
            log.log(log.LEXER, log.TRACE, "No word to read")
            return None
        # Capture the start position before consuming the word's characters.
        pos = self.pos
        line = self.pos_line
        column = self.pos_column
        while symbol and not is_whitespace(symbol):
            value += symbol
            symbol = self.next()
        word = Word(value, pos, line, column)
        log.log(log.LEXER, log.TRACE, "Read %s" % (word))
        return word

    def skip_note(self, line, column):
        """Discard words until an EndNote keyword is read.

        *line*/*column* locate the opening StartNote for error reporting.
        Raises ParseError if the input ends before EndNote appears.
        """
        log.log(log.LEXER, log.TRACE, "Skipping words until EndNote")
        context = ParseContext(
            None, "reading note", ParseLocation(line, column, self.filename)
        )
        word = self.read_word()
        while word and word.value != "EndNote":
            word = self.read_word()
        if not word:
            raise ParseError(context, "Hit end of file before EndNote")

    def read_text(self, line, column):
        """Read a text literal's raw contents up to the EndText keyword.

        Returns the characters between StartText and EndText with
        surrounding spaces/tabs/newlines stripped.  Raises ParseError if
        the input ends before EndText is found.
        """
        log.log(log.LEXER, log.TRACE, "Reading characters until EndText")
        context = ParseContext(
            None, "reading text", ParseLocation(line, column, self.filename)
        )
        start = self.pos
        word = self.read_word()
        while word and word.value != "EndText":
            word = self.read_word()
        if not word:
            raise ParseError(context, "Hit end of file before EndText")
        else:
            # Slice from just after StartText up to just before EndText,
            # then trim surrounding whitespace from the literal.
            return self.code[start + 1 : word.position - 1].strip("\n\t ")

    def skip_shebang(self):
        """Drop a leading '#!' line so Unix shebangs are not tokenized."""
        log.log(log.LEXER, log.TRACE, "Skipping shebang")
        next_line = self.code.find("\n") + 1
        self.code = self.code[next_line:]
        # The remaining code starts on line 2 of the original file.
        self.pos_line = 2

    def tokenize(self):
        """Tokenize the whole input; return the tokens plus a final EOF token.

        Produces tokens of type "text", "bool", "keyword" or "symbol".
        Notes (StartNote..EndNote) are discarded entirely.  Raises
        ParseError on a stray EndNote or EndText.
        """
        if self.code[0:2] == "#!":
            self.skip_shebang()
        keywords = [
            "NewLang",
            "Done",
            "Set",
            "To",
            "EndSet",
            "If",
            "Then",
            "Else",
            "EndIf",
        ]
        tokens = []
        word = self.read_word()
        while word:
            token = word.value
            line = word.line
            column = word.column
            context = ParseContext(
                None, "reading word", ParseLocation(line, column, self.filename)
            )
            if token == "StartNote":
                # Notes produce no token at all.
                self.skip_note(line, column)
                word = self.read_word()
                continue
            elif token == "EndNote":
                raise ParseError(context, "Found stray EndNote")
            elif token == "StartText":
                type = "text"
                value = self.read_text(line, column)
            elif token == "EndText":
                raise ParseError(context, "Found stray EndText")
            elif token in ["True", "False"]:
                type = "bool"
                value = token == "True"
            elif token in keywords:
                type = "keyword"
                value = token
            else:
                # Anything unrecognized is a user symbol (identifier).
                type = "symbol"
                value = token
            tok = Token(type, value, ParseLocation(line, column, self.filename))
            log.log(log.LEXER, log.DEBUG, "Appending %s" % (tok))
            tokens.append(tok)
            word = self.read_word()
        log.log(log.LEXER, log.TRACE, "Done tokenizing, adding EOF")
        tokens.append(
            Token(
                "EOF",
                None,
                ParseLocation(self.pos_line, self.pos_column, self.filename),
            )
        )
        log.log(log.LEXER, log.DEBUG, "Tokens are %s" % (tokens))
        return tokens


class Parser:
    """Recursive-descent parser over a Tokenizer token list.

    Builds ast_types nodes (Reference/Text/Bool/Statement/Set/Conditional)
    and threads ParseContext chains through every step so errors can
    report exactly what was being parsed and where.
    """

    def __init__(self, tokens):
        # Token stream, terminated by an "EOF" token.
        self.tokens = tokens
        # Index of the next token to read.
        self.pos = 0

    def next(self):
        """Return the current token and advance (stays put on the last one)."""
        token = self.tokens[self.pos]
        if self.pos < (len(self.tokens) - 1):
            self.pos += 1
        log.log(log.PARSER, log.TRACE, "Read %s" % (token))
        return token

    def peek(self):
        """Return the current token without consuming it."""
        token = self.tokens[self.pos]
        log.log(log.PARSER, log.TRACE, "Peeked %s" % (token))
        return token

    def eof(self):
        """Return True once the current token is the EOF marker."""
        return self.tokens[self.pos].type == "EOF"

    def create_context(self, context, text):
        """Build a ParseContext chained to *context* at the current token."""
        token = self.tokens[self.pos]
        return ParseContext(context, text, token.location)

    def parse_version(self, context):
        """Parse the leading 'NewLang <version>' header; return the version.

        Raises ParseError unless the header is exactly the NewLang keyword
        followed by version "0".
        """
        log.log(log.PARSER, log.TRACE, "Parsing version identifier...")
        context = self.create_context(context, "parsing version identifier")
        token = self.next()
        if token.type != "keyword" or token.value != "NewLang":
            raise ParseError(
                context, "Expected 'NewLang' keyword, got '%s'" % (token.value)
            )
        token = self.next()
        version = token.value
        # Only language version "0" is understood.
        if version != "0":
            raise ParseError(context, "Unknown version '%s'" % (version))
        log.log(log.PARSER, log.DEBUG, "Parsed version %s" % (version))
        return version

    def parse_value(self, context, subject, type, value):
        """Convert one token's (type, value) into an AST value node.

        symbol -> Reference, text -> Text, bool -> Bool; any other type
        raises ParseError.  NOTE(review): the *subject* argument is
        currently unused — confirm whether it can be dropped.
        """
        log.log(log.PARSER, log.TRACE, "Parsing value...")
        if type == "symbol":
            ret = ast_types.Reference(value)
        elif type == "text":
            ret = ast_types.Text(value)
        elif type == "bool":
            ret = ast_types.Bool(value)
        else:
            raise ParseError(context, "Unexpected value type %s" % (type))
        log.log(log.PARSER, log.TRACE, "Parsed value, AST is %s" % (ret))
        return ret

    def parse_arguments(self, meta_context, terminator):
        """Parse value arguments until the *terminator* keyword; return a list.

        Any keyword other than the terminator raises ParseError; non-keyword
        tokens are parsed as values and collected.
        """
        log.log(log.PARSER, log.TRACE, "Parsing arguments until '%s'..." % (terminator))
        context = self.create_context(meta_context, "parsing statement arguments")
        args = []
        arg_num = 1
        while True:
            log.log(log.PARSER, log.TRACE, "Parsing argument %i..." % (arg_num))
            arg_context = self.create_context(
                context, "parsing argument %i" % (arg_num)
            )
            end_context = self.create_context(context, "parsing terminator")
            token = self.next()
            arg_num += 1
            if token.type == "keyword":
                if token.value == terminator:
                    log.log(
                        log.PARSER, log.TRACE, "Parsed arguments, AST is %s" % (args)
                    )
                    return args
                else:
                    raise ParseError(
                        end_context, "Expected %s, got %s" % (terminator, token.value)
                    )
            else:
                arg = self.parse_value(arg_context, "argument", token.type, token.value)
                log.log(log.PARSER, log.TRACE, "Parsed argument %s" % (arg))
                args.append(arg)

    def parse_statement(self, context, terminator, type):
        """Parse 'subject [verb args...] terminator' into a Statement node.

        *terminator* is the keyword ending the statement (e.g. "Done",
        "EndSet"); *type* is a human-readable label used only in logs and
        error contexts.  The verb is optional: the terminator may directly
        follow the subject.
        """
        log.log(
            log.PARSER,
            log.TRACE,
            "Parsing %s statement until '%s'..." % (type, terminator),
        )
        meta_context = self.create_context(context, "parsing %s statement" % (type))
        log.log(log.PARSER, log.TRACE, "Parsing statement subject...")
        context = self.create_context(meta_context, "parsing subject")
        token = self.next()
        subject = self.parse_value(context, "subject", token.type, token.value)
        log.log(log.PARSER, log.TRACE, "Parsing statement verb...")
        context = self.create_context(meta_context, "parsing statement verb")
        end_context = self.create_context(context, "parsing terminator")
        token = self.next()
        if token.type == "keyword":
            if token.value == terminator:
                # Terminator right after the subject: statement has no verb.
                verb = None
            else:
                raise ParseError(
                    end_context, "Expected %s, got %s" % (terminator, token.value)
                )
        elif token.type == "symbol":
            verb = token.value
        else:
            raise ParseError(context, "Expected symbol, got %s" % (token.type))
        log.log(log.PARSER, log.TRACE, "Parsing statement arguments...")
        if verb:
            arguments = self.parse_arguments(meta_context, terminator)
        else:
            arguments = []
        statement = ast_types.Statement(subject, verb, arguments)
        log.log(log.PARSER, log.DEBUG, "Parsed statement, AST is %s" % (statement))
        return statement

    def parse_set(self, context):
        """Parse 'Set <symbol> To <statement> EndSet' into a Set node."""
        log.log(log.PARSER, log.TRACE, "Parsing set directive...")
        meta_context = self.create_context(context, "parsing set directive")
        self.next()  # Skip 'Set'
        log.log(log.PARSER, log.TRACE, "Parsing set subject...")
        context = self.create_context(meta_context, "parsing subject")
        token = self.next()
        if token.type != "symbol":
            raise ParseError(context, "Expected symbol, got %s" % (token.type))
        subject = token.value
        log.log(log.PARSER, log.TRACE, "Parsing set separator...")
        context = self.create_context(meta_context, "parsing set separator")
        token = self.next()
        if token.type != "keyword" or token.value != "To":
            # Quote non-keyword tokens so the message distinguishes them
            # from keywords.
            pretty_value = token.value
            if token.type != "keyword":
                pretty_value = "'%s'" % (pretty_value)
            raise ParseError(context, "Expected To, got %s" % (pretty_value))
        log.log(log.PARSER, log.TRACE, "Parsing set value...")
        ast = self.parse_statement(meta_context, "EndSet", "set value")
        set = ast_types.Set(subject, ast)
        log.log(log.PARSER, log.DEBUG, "Parsed set, AST is %s" % (set))
        return set

    def parse_if(self, context):
        """Parse 'If <test> Then <success> Else <failure> EndIf'.

        Returns an ast_types.Conditional with the three parsed statements.
        """
        log.log(log.PARSER, log.TRACE, "Parsing if directive...")
        context = self.create_context(context, "parsing if directive")
        self.next()  # Skip 'If'
        test = self.parse_statement(context, "Then", "test condition")
        log.log(log.PARSER, log.TRACE, "Parsing if success statement...")
        success = self.parse_statement(context, "Else", "success")
        log.log(log.PARSER, log.TRACE, "Parsing if failure statement...")
        failure = self.parse_statement(context, "EndIf", "failure")
        conditional = ast_types.Conditional(test, success, failure)
        log.log(log.PARSER, log.DEBUG, "Parsed if, AST is %s" % (conditional))
        return conditional

    def parse_directive(self, context):
        """Dispatch on the next token: Set, If, or a plain 'Done' command."""
        token = self.peek()
        if token.type != "keyword" and token.type != "symbol" and token.type != "bool":
            raise ParseError(
                context, "Expected keyword, symbol or bool, got %s" % (token.type)
            )
        if token.type == "keyword":
            if token.value == "Set":
                return self.parse_set(context)
            elif token.value == "If":
                return self.parse_if(context)
            else:
                raise ParseError(context, "Unexpected keyword %s" % (token.value))
        else:
            ast = self.parse_statement(context, "Done", "command")
            return ast

    def parse_file(self):
        """Parse the whole token stream: version header then directives.

        Returns the list of parsed directive AST nodes.
        """
        log.log(log.PARSER, log.TRACE, "Parsing file...")
        ast = []
        self.parse_version(None)
        while not self.eof():
            log.log(log.PARSER, log.TRACE, "Parsing next directive in file...")
            ast.append(self.parse_directive(None))
        log.log(log.PARSER, log.DEBUG, "Parsed file, AST is %s" % (ast))
        return ast


def parse_file(filename):
    """Read and parse the NewLang source file at *filename*.

    Returns the parsed AST (a list of directive nodes) on success, or
    None if the file is not valid UTF-8 or a parse error occurs.  Errors
    are printed to stdout along with the chain of parse contexts.
    """
    try:
        # Use a context manager so the file handle is closed promptly
        # instead of leaking until garbage collection.
        with open(filename, encoding="utf-8") as f:
            code = f.read()
    except UnicodeDecodeError:
        print("Parse error: %s is not valid UTF-8" % (filename))
        return None
    try:
        tokenizer = Tokenizer(code, filename)
        tokens = tokenizer.tokenize()
        parser = Parser(tokens)
        return parser.parse_file()
    except ParseError as e:
        print("Parse error: %s" % (e.error))
        # Walk the context chain from innermost to outermost activity.
        context = e.context
        while context:
            line = context.location.line
            column = context.location.column
            print("While %s at line %i column %i" % (context.context, line, column))
            context = context.parent
        print("While parsing file %s" % (filename))
        return None