diff --git a/docs/syntax.md b/docs/syntax.md
index adf289b..ed17e66 100644
--- a/docs/syntax.md
+++ b/docs/syntax.md
@@ -176,14 +176,15 @@
 'StartText' is not allowed in text values.
 This helps catch cases of accidental nesting.

-Whitespace after 'StartText' and before 'EndText' are automatically removed.
+Whitespace is not preserved in the arbitrary text; instead, each word is
+joined with a U+0020 SPACE code point.

 The arbitrary text will be used as Text data.

 Here's an example:

 ```
-System Print StartText Hello, world! EndText Done
+System Print StartText Hello, world! EndText Done
 ```

 This will print the text 'Hello, world!'
diff --git a/src/parse.py b/src/parse.py
index 1866b02..8494d4a 100644
--- a/src/parse.py
+++ b/src/parse.py
@@ -4,15 +4,6 @@
 from src.syntax import Syntax, SyntaxType


-# Removes whitespace syntax tokens
-def strip_whitespace(syntax):
-    output = []
-    for s in syntax:
-        if s.type != SyntaxType.TOKEN or s.value not in ["\n", " "]:
-            output.append(s)
-    return output
-
-
 # Reads a token, possibly of a certain value
 def read_token(stream, value):
     s = stream.pop()
@@ -44,13 +35,12 @@
         elif s.value == "EndText":
             break
         else:
-            buffer += s.value
-    value = buffer.strip("\n\t ")
+            buffer += s.value + " "
     type = SyntaxType.TEXT
+    value = buffer[:-1]  # Drop trailing space
     return Syntax(value, location, type)


 # Parses tokens
 def parse(tokens):
-    stripped = strip_whitespace(tokens)
-    return stripped
+    return tokens
diff --git a/src/tokenize.py b/src/tokenize.py
index 2365a96..882d00e 100644
--- a/src/tokenize.py
+++ b/src/tokenize.py
@@ -62,8 +62,18 @@
     return new_tokens


+# Removes whitespace tokens
+def strip_whitespace(syntax):
+    output = []
+    for s in syntax:
+        if s.type == SyntaxType.TOKEN and not is_whitespace(s.value):
+            output.append(s)
+    return output
+
+
 # Tokenizes source code
 def tokenize(source, filename):
     split = split_tokens(source)
     located = locate_tokens(split, filename)
-    return located
+    stripped = strip_whitespace(located)
+    return stripped
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 600ac5a..fd08c4a 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -27,27 +27,6 @@
     return new_data


-# Draws syntax and a syntax without whitespace in it
-@composite
-def draw_syntax_whitespace(draw):
-    input = draw(lists(draw_syntax_random()))
-    syntax = []
-    for s in input:
-        if s.type != SyntaxType.TOKEN or s.value not in ["\n", " "]:
-            syntax.append(s)
-    return (input, syntax)
-
-
-# Tests strip_whitespace works correctly
-# We expect the following behaviour:
-# - No syntax is modified
-# - Tokens of value '\n' or ' ' are removed from the output
-@given(draw_syntax_whitespace())
-def test_parse_strip_whitespace(test_data):
-    (input, syntax) = test_data
-    assert parse.strip_whitespace(input) == syntax
-
-
 # Draws a random token suitable for text building
 @composite
 def draw_text_value_token(draw):
@@ -68,14 +47,14 @@
 @composite
 def draw_syntax_text_valid(draw):
     tokens = draw(lists(draw_text_value_token()))
-    value = ""
+    buffer = ""
     for token in tokens:
-        value += token.value
+        buffer += token.value + " "
+    value = buffer[:-1]  # Drop trailing space
     start = draw(draw_token_by_value("StartText"))
     end = draw(draw_token_by_value("EndText"))
     all_tokens = [start] + tokens + [end]
-    text_value = value.strip("\n\t ")
-    result = Syntax(text_value, start.location, SyntaxType.TEXT)
+    result = Syntax(value, start.location, SyntaxType.TEXT)
     return (all_tokens, result)


@@ -83,7 +62,7 @@
 # We expect the following behaviour:
 # - Only the text expression is parsed
 # - The resulting text is the value of tokens between StartText and EndText
-# - The resulting text has its surrounding whitespace stripped
+# - The values of the tokens are joined with U+0020 SPACE code points
 # - The Syntax's value is the resulting text
 # - The Syntax's type is SyntaxType.TEXT
 # - The Syntax's location is the StartText location
@@ -170,9 +149,8 @@

 # Tests the parser wrapper works correctly
 # We expect the following behaviour:
-# - Whitespace tokens are stripped
+# - Nothing happens for now
 @given(lists(draw_token_classified()))
 def test_parse_fuzz(tokens):
-    stripped = parse.strip_whitespace(tokens)
     parsed = parse.parse(tokens)
-    assert stripped == parsed
+    assert tokens == parsed
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index abc9333..3089c87 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -103,6 +103,30 @@
     assert tokenize.locate_tokens(input, filename) == located


+# Generates a list of tokens with and without whitespace
+@composite
+def draw_tokens_whitespace(draw):
+    input = draw(lists(draw_token_classified()))
+    syntax = []
+    for s in input:
+        if s.type != SyntaxType.TOKEN or s.value not in ["\t", "\n", " "]:
+            syntax.append(s)
+    return (input, syntax)
+
+
+# Tests that the tokenizer strips whitespace correctly
+# We expect the following behaviour:
+# - No syntax is modified
+# - Tokens with the following values are removed from the output:
+#   U+0009 HORIZONTAL TAB
+#   U+000A LINE FEED
+#   U+0020 SPACE
+@given(draw_tokens_whitespace())
+def test_tokenize_strip_whitespace(test_data):
+    (input, syntax) = test_data
+    assert tokenize.strip_whitespace(input) == syntax
+
+
 # Draws a token and possibly add garbage
 # This is to ensure that tokens must completely match a value
 @composite
@@ -134,5 +158,6 @@
 def test_tokenize_fuzz(source, filename):
     split = tokenize.split_tokens(source)
     located = tokenize.locate_tokens(split, filename)
+    stripped = tokenize.strip_whitespace(located)
     tokenized = tokenize.tokenize(source, filename)
-    assert located == tokenized
+    assert stripped == tokenized
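For readers skimming the patch, here is a minimal standalone sketch of the text-joining behaviour described in the docs/syntax.md change above. The function name join_text_tokens and the plain list-of-strings input are illustrative stand-ins only; the real parser in src/parse.py works on Syntax objects from src/syntax.py.

```
# Sketch only: mirrors the buffer-and-join logic added to parse.py.
# Words between StartText and EndText are joined with single U+0020
# spaces, so the original whitespace between them is not preserved.
def join_text_tokens(token_values):
    buffer = ""
    for value in token_values:
        buffer += value + " "
    return buffer[:-1]  # Drop trailing space


# Roughly equivalent to: System Print StartText Hello, world! EndText Done
print(join_text_tokens(["Hello,", "world!"]))  # prints: Hello, world!
```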