diff --git a/src/parse.py b/src/parse.py
index ace60f4..c2754be 100644
--- a/src/parse.py
+++ b/src/parse.py
@@ -1,35 +1,7 @@
 # SPDX-License-Identifier: LGPL-2.1-only
 # Copyright 2022 Jookia
 
-import enum
-
-
-# The type of syntax
-class SyntaxType(enum.Enum):
-    TOKEN = enum.auto()  # pragma: no mutate
-    TEXT = enum.auto()  # pragma: no mutate
-
-
-# Represents a syntax node
-class Syntax:
-    def __init__(self, value, location, type):
-        self.value = value
-        self.location = location
-        self.type = type
-
-    def __repr__(self):
-        return "Syntax(value %s, location %s, type %s)" % (  # pragma: no mutate
-            repr(self.value),
-            repr(self.location),
-            str(self.type),
-        )
-
-    def __eq__(self, other):
-        return (
-            self.type == other.type
-            and self.value == other.value
-            and self.location == other.location
-        )
+from src import tokenize
 
 
 # Represents a stream of consumable syntax nodes
@@ -47,19 +19,11 @@
             return None
 
 
-# Converts tokens to syntax
-def import_tokens(tokens):
-    output = []
-    for t in tokens:
-        output.append(Syntax(t.value, t.location, SyntaxType.TOKEN))
-    return output
-
-
 # Removes whitespace syntax tokens
 def strip_whitespace(syntax):
     output = []
     for s in syntax:
-        if s.type != SyntaxType.TOKEN or s.value not in ["\n", " "]:
+        if s.type != tokenize.SyntaxType.TOKEN or s.value not in ["\n", " "]:
             output.append(s)
     return output
 
@@ -72,7 +36,7 @@
     # Error if there's not a valid StartText token
     if s is None:
         return None
-    elif s.type != SyntaxType.TOKEN:
+    elif s.type != tokenize.SyntaxType.TOKEN:
         return None
     elif s.value != "StartText":
         return None
@@ -84,7 +48,7 @@
         if s is None:
             return None
         # Error if any of the text isn't a token
-        elif s.type != SyntaxType.TOKEN:
+        elif s.type != tokenize.SyntaxType.TOKEN:
             return None
         # Don't allow StartText in text
        elif s.value in ["StartText"]:
@@ -95,12 +59,11 @@
         else:
             buffer += s.value
     value = buffer.strip("\n\t ")
-    type = SyntaxType.TEXT
-    return Syntax(value, location, type)
+    type = tokenize.SyntaxType.TEXT
+    return tokenize.Syntax(value, location, type)
 
 
 # Parses tokens
 def parse(tokens):
-    converted = import_tokens(tokens)
-    stripped = strip_whitespace(converted)
+    stripped = strip_whitespace(tokens)
     return stripped
diff --git a/src/tokenize.py b/src/tokenize.py
index dcd2596..5af370e 100644
--- a/src/tokenize.py
+++ b/src/tokenize.py
@@ -1,6 +1,58 @@
 # SPDX-License-Identifier: LGPL-2.1-only
 # Copyright 2022 Jookia
 
+import enum
+
+
+# The type of syntax
+class SyntaxType(enum.Enum):
+    TOKEN = enum.auto()  # pragma: no mutate
+    TEXT = enum.auto()  # pragma: no mutate
+
+
+# Represents a syntax node
+class Syntax:
+    def __init__(self, value, location, type):
+        self.value = value
+        self.location = location
+        self.type = type
+
+    def __repr__(self):
+        return "Syntax(value %s, location %s, type %s)" % (  # pragma: no mutate
+            repr(self.value),
+            repr(self.location),
+            str(self.type),
+        )
+
+    def __eq__(self, other):
+        return (
+            self.type == other.type
+            and self.value == other.value
+            and self.location == other.location
+        )
+
+
+# Location of a syntax node
+class SyntaxLocation:
+    def __init__(self, line, column, file):
+        self.line = line
+        self.column = column
+        self.file = file
+
+    def __repr__(self):
+        return "SyntaxLocation(line %i, column %i, file '%s')" % (  # pragma: no mutate
+            self.line,
+            self.column,
+            self.file,
+        )
+
+    def __eq__(self, other):
+        return (
+            self.line == other.line
+            and self.column == other.column
+            and self.file == other.file
+        )
+
 
 # Checks whether a symbol is space
 def is_space(symbol):
@@ -17,44 +69,6 @@
     return is_space(symbol) or is_newline(symbol)
 
 
-# Location of a token
-class TokenLocation:
-    def __init__(self, line, column, file):
-        self.line = line
-        self.column = column
-        self.file = file
-
-    def __repr__(self):
-        return "TokenLocation(line %i, column %i, file '%s')" % (  # pragma: no mutate
-            self.line,
-            self.column,
-            self.file,
-        )
-
-    def __eq__(self, other):
-        return (
-            self.line == other.line
-            and self.column == other.column
-            and self.file == other.file
-        )
-
-
-# Represents a tokenizer token
-class Token:
-    def __init__(self, value, location):
-        self.value = value
-        self.location = location
-
-    def __repr__(self):
-        return "Token(value %s, location %s)" % (  # pragma: no mutate
-            repr(self.value),
-            repr(self.location),
-        )
-
-    def __eq__(self, other):
-        return self.value == other.value and self.location == other.location
-
-
 # Splits text in to a list of characters and whitespace
 def split_tokens(input):
     if input == "":
@@ -62,22 +76,22 @@
     tokens = []
     current = input[0]
     curr_whitespace = is_whitespace(input[0])
-    location = TokenLocation(1, 1, "")
+    location = SyntaxLocation(1, 1, "")
     for c in input[1:]:
         c_whitespace = is_whitespace(c)
         if c_whitespace != curr_whitespace:
             # Flush current buffer and switch modes
-            tokens.append(Token(current, location))
+            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
             current = c
             curr_whitespace = c_whitespace
         elif curr_whitespace:
             # Whitespace mode appends each character
-            tokens.append(Token(current, location))
+            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
             current = c
         else:
             # Token mode builds the current buffer
             current += c
-    tokens.append(Token(current, location))
+    tokens.append(Syntax(current, location, SyntaxType.TOKEN))
     return tokens
 
 
@@ -105,8 +119,8 @@
     line = 1
     column = 1
     for t in tokens:
-        location = TokenLocation(line, column, filename)
-        new = Token(t.value, location)
+        location = SyntaxLocation(line, column, filename)
+        new = Syntax(t.value, location, SyntaxType.TOKEN)
         new_tokens.append(new)
         if is_newline(t.value):
             line = line + 1
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 0e90a28..8b9890e 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -12,38 +12,32 @@
     sampled_from,
 )
 
-from src import parse
+from src import parse, tokenize
 from tests import test_tokenize
 
 
-# Draws a random syntax location
-@composite
-def draw_syntax_location(draw):
-    return draw(test_tokenize.draw_token_location())
-
-
 # Draws a random syntax type
 @composite
 def draw_syntax_type(draw):
-    return draw(sampled_from(list(parse.SyntaxType)))
+    return draw(sampled_from(list(tokenize.SyntaxType)))
 
 
 # Draws a token syntax value
 @composite
 def draw_syntax_token(draw):
     value = draw(test_tokenize.draw_token_classified())
-    location = draw(draw_syntax_location())
-    type = parse.SyntaxType.TOKEN
-    return parse.Syntax(value.value, location, type)
+    location = draw(test_tokenize.draw_token_location())
+    type = tokenize.SyntaxType.TOKEN
+    return tokenize.Syntax(value.value, location, type)
 
 
 # Draws a text syntax value
 @composite
 def draw_syntax_text(draw):
     value = draw(text())
-    location = draw(draw_syntax_location())
-    type = parse.SyntaxType.TEXT
-    return parse.Syntax(value, location, type)
+    location = draw(test_tokenize.draw_token_location())
+    type = tokenize.SyntaxType.TEXT
+    return tokenize.Syntax(value, location, type)
 
 
 # Draws a random syntax
@@ -57,10 +51,10 @@
 
 
 # Test syntax getters
-@given(text(), draw_syntax_location(), draw_syntax_type())
+@given(text(), test_tokenize.draw_token_location(), draw_syntax_type())
 def test_parse_syntax_getters(value, location, type):
     # Use text as a somewhat random value
-    test = parse.Syntax(value, location, type)
+    test = tokenize.Syntax(value, location, type)
     assert test.value == value
     assert test.location == location
     assert test.type == type
@@ -99,29 +93,17 @@
     input = draw(lists(test_tokenize.draw_token_random()))
     tokens = []
     for t in input:
-        tokens.append(parse.Syntax(t.value, t.location, parse.SyntaxType.TOKEN))
+        tokens.append(tokenize.Syntax(t.value, t.location, tokenize.SyntaxType.TOKEN))
     return (input, tokens)
 
 
-# Tests importing tokens works correctly
-# We expect the following behaviour:
-# - Each token is converted to a Syntax
-# - The Syntax's value is the token
-# - The Syntax's location is the token location
-# - The Syntax's type is SyntaxType.TOKEN
-@given(draw_syntax_imported())
-def test_parse_import_tokens(test_data):
-    (input, syntax) = test_data
-    assert parse.import_tokens(input) == syntax
-
-
 # Draws syntax and a syntax without whitespace in it
 @composite
 def draw_syntax_whitespace(draw):
     input = draw(lists(draw_syntax_random()))
     syntax = []
     for s in input:
-        if s.type != parse.SyntaxType.TOKEN or s.value not in ["\n", " "]:
+        if s.type != tokenize.SyntaxType.TOKEN or s.value not in ["\n", " "]:
             syntax.append(s)
     return (input, syntax)
 
@@ -153,17 +135,17 @@
         value += token.value
     s_value = draw(test_tokenize.draw_token_keyword())
     s_value = "StartText"
-    s_location = draw(draw_syntax_location())
-    s_type = parse.SyntaxType.TOKEN
-    start = parse.Syntax(s_value, s_location, s_type)
+    s_location = draw(test_tokenize.draw_token_location())
+    s_type = tokenize.SyntaxType.TOKEN
+    start = tokenize.Syntax(s_value, s_location, s_type)
     e_value = draw(test_tokenize.draw_token_keyword())
     e_value = "EndText"
-    e_location = draw(draw_syntax_location())
-    e_type = parse.SyntaxType.TOKEN
-    end = parse.Syntax(e_value, e_location, e_type)
+    e_location = draw(test_tokenize.draw_token_location())
+    e_type = tokenize.SyntaxType.TOKEN
+    end = tokenize.Syntax(e_value, e_location, e_type)
     all_tokens = [start] + tokens + [end]
     text_value = value.strip("\n\t ")
-    result = parse.Syntax(text_value, s_location, parse.SyntaxType.TEXT)
+    result = tokenize.Syntax(text_value, s_location, tokenize.SyntaxType.TEXT)
     return (all_tokens, result)
 
 
@@ -197,7 +179,7 @@
     if draw(booleans()):
         token = draw(draw_syntax_random())
         assume(
-            not (token.type == parse.SyntaxType.TOKEN and token.value == "StartText")
+            not (token.type == tokenize.SyntaxType.TOKEN and token.value == "StartText")
         )
         new_tokens = [token] + tokens[1:0]
         return new_tokens
@@ -212,7 +194,7 @@
 def draw_syntax_text_invalid_invalidcontent(draw):
     (tokens, _) = draw(draw_syntax_text_valid())
     token = draw(draw_syntax_random())
-    assume(token.type != parse.SyntaxType.TOKEN)
+    assume(token.type != tokenize.SyntaxType.TOKEN)
     pos = draw(integers(min_value=1, max_value=(len(tokens) - 1)))
     new_tokens = tokens[0:pos] + [token] + tokens[pos:]
     return new_tokens
@@ -226,9 +208,9 @@
     (tokens, _) = draw(draw_syntax_text_valid())
     s_value = draw(test_tokenize.draw_token_keyword())
     s_value = "StartText"
-    s_location = draw(draw_syntax_location())
-    s_type = parse.SyntaxType.TOKEN
-    start = parse.Syntax(s_value, s_location, s_type)
+    s_location = draw(test_tokenize.draw_token_location())
+    s_type = tokenize.SyntaxType.TOKEN
+    start = tokenize.Syntax(s_value, s_location, s_type)
     pos = draw(integers(min_value=1, max_value=(len(tokens) - 1)))
     new_tokens = tokens[0:pos] + [start] + tokens[pos:]
     return new_tokens
@@ -269,7 +251,6 @@
 # - Whitespace tokens are stripped
 @given(lists(test_tokenize.draw_token_classified()))
 def test_parse_fuzz(tokens):
-    converted = parse.import_tokens(tokens)
-    stripped = parse.strip_whitespace(converted)
+    stripped = parse.strip_whitespace(tokens)
     parsed = parse.parse(tokens)
     assert stripped == parsed
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 654c3e5..b68baa0 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -41,7 +41,7 @@
     line = draw(integers())
     column = draw(integers())
     filename = draw(text())
-    return tokenize.TokenLocation(line, column, filename)
+    return tokenize.SyntaxLocation(line, column, filename)
 
 
 # Draws a random token
@@ -49,7 +49,7 @@
 def draw_token_random(draw):
     value = draw(text())
     location = draw(draw_token_location())
-    return tokenize.Token(value, location)
+    return tokenize.Syntax(value, location, tokenize.SyntaxType.TOKEN)
 
 
 # Draws an unknown token
@@ -62,7 +62,7 @@
     assume(value not in ["True", "False"])
     assume(value not in keywords)
     assume(value[0:2] != "#!")
-    return tokenize.Token(value, token.location)
+    return tokenize.Syntax(value, token.location, tokenize.SyntaxType.TOKEN)
 
 
 # Draws a space token
@@ -71,7 +71,7 @@
     space = " \t"
     token = draw(draw_token_random())
     value = draw(sampled_from(space))
-    return tokenize.Token(value, token.location)
+    return tokenize.Syntax(value, token.location, tokenize.SyntaxType.TOKEN)
 
 
 # Draws a new line token
@@ -79,7 +79,7 @@
 def draw_token_newline(draw):
     token = draw(draw_token_random())
     value = "\n"
-    return tokenize.Token(value, token.location)
+    return tokenize.Syntax(value, token.location, tokenize.SyntaxType.TOKEN)
 
 
 # Draws a bool token
@@ -90,7 +90,7 @@
         value = "True"
     else:
         value = "False"
-    return tokenize.Token(value, token.location)
+    return tokenize.Syntax(value, token.location, tokenize.SyntaxType.TOKEN)
 
 
 # Draws a keyword token
@@ -98,7 +98,7 @@
 def draw_token_keyword(draw):
     token = draw(draw_token_random())
     value = draw(sampled_from(keywords))
-    return tokenize.Token(value, token.location)
+    return tokenize.Syntax(value, token.location, tokenize.SyntaxType.TOKEN)
 
 
 # Draws a shebang token
@@ -106,7 +106,7 @@
 def draw_token_shebang(draw):
     token = draw(draw_token_random())
     value = "#!" + draw(text())
-    return tokenize.Token(value, token.location)
+    return tokenize.Syntax(value, token.location, tokenize.SyntaxType.TOKEN)
 
 
 # Draws a classified token
@@ -127,7 +127,7 @@
 # Test location getters
 @given(integers(), integers(), text())
 def test_tokenize_location_getters(line, column, filename):
-    test = tokenize.TokenLocation(line, column, filename)
+    test = tokenize.SyntaxLocation(line, column, filename)
     assert test.line == line
     assert test.column == column
     assert test.file == filename
@@ -147,7 +147,7 @@
 # Test token getters
 @given(text(), draw_token_location())
 def test_tokenize_token_getters(value, location):
-    test = tokenize.Token(value, location)
+    test = tokenize.Syntax(value, location, tokenize.SyntaxType.TOKEN)
     assert test.value == value
     assert test.location == location
 
@@ -163,8 +163,8 @@
 @composite
 def draw_token_splitted(draw, strategy):
     token = draw(strategy)
-    location = tokenize.TokenLocation(1, 1, "")
-    return tokenize.Token(token.value, location)
+    location = tokenize.SyntaxLocation(1, 1, "")
+    return tokenize.Syntax(token.value, location, tokenize.SyntaxType.TOKEN)
 
 
 # Generates an alternating sequence of unknown or whitespace tokens
@@ -214,8 +214,8 @@
     line = 1
     column = 1
     for t in tokens:
-        location = tokenize.TokenLocation(line, column, filename)
-        new = tokenize.Token(t.value, location)
+        location = tokenize.SyntaxLocation(line, column, filename)
+        new = tokenize.Syntax(t.value, location, tokenize.SyntaxType.TOKEN)
         located.append(new)
         if t.value == "\n":
             line = line + 1
@@ -251,10 +251,7 @@
         value = draw(text(min_size=1)) + value
     if draw(booleans()):
         value = value + draw(text(min_size=1))
-    return tokenize.Token(
-        value,
-        token.location,
-    )
+    return tokenize.Syntax(value, token.location, tokenize.SyntaxType.TOKEN)
 
 
 # Draw a random string made of token values