diff --git a/docs/syntax.md b/docs/syntax.md index ed17e66..b3fe6c8 100644 --- a/docs/syntax.md +++ b/docs/syntax.md @@ -86,16 +86,29 @@ This is not specified using formal grammar or language. Any ambiguity is accidental. +### Whitespace + +Spaces are the following code points: + +- U+0009 HORIZONTAL TAB +- U+0020 SPACE + +New lines are the following code points: + +- U+000A LINE FEED +- U+000B VERTICAL TAB +- U+000C FORM FEED +- U+000D CARRIAGE RETURN +- U+0085 NEXT LINE +- U+2028 LINE SEPARATOR +- U+2029 PARAGRAPH SEPARATOR + +Both spaces and new lines are treated as whitespace. + ### Tokens Syntax in NewLang is formed using alphanumeric tokens separated by whitespace. -Whitespace can be the following Unicode code points: - -- U+000A LINE FEED -- U+0009 HORIZONTAL TAB -- U+0020 SPACE - For example, the following code snippet: ``` diff --git a/src/tokenize.py b/src/tokenize.py index 882d00e..ca99da0 100644 --- a/src/tokenize.py +++ b/src/tokenize.py @@ -3,23 +3,30 @@ from src.syntax import Syntax, SyntaxLocation, SyntaxType +# Valid space code points +spaces = [ + "\t", # U+0009 HORIZONTAL TAB + " ", # U+0020 SPACE +] -# Checks whether a symbol is space -def is_space(symbol): - return symbol == " " or symbol == "\t" - - -# Checks whether a symbol is a new line -def is_newline(symbol): - return symbol == "\n" +# Valid new line tokens +newlines = [ + "\n", # U+000A LINE FEED + "\v", # U+000B VERTICAL TAB + "\f", # U+000C FORM FEED + "\r", # U+000D CARRIAGE RETURN + "\u0085", # U+0085 NEXT LINE + "\u2028", # U+2028 LINE SEPARATOR + "\u2029", # U+2029 PARAGRAPH SEPARATOR +] # Checks whether a symbol is general whitespace def is_whitespace(symbol): - return is_space(symbol) or is_newline(symbol) + return symbol in spaces or symbol in newlines -# Splits text in non-whitespace and whitespace +# Splits text in to a list of tokens and whitespace def split_tokens(input): if input == "": return [] @@ -54,7 +61,7 @@ location = SyntaxLocation(line, offset, filename) new = 
Syntax(t.value, location, SyntaxType.TOKEN) new_tokens.append(new) - if is_newline(t.value): + if t.value in newlines: line = line + 1 offset = 1 else: diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 83dc2da..ef32e9a 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -84,10 +84,28 @@ return Syntax(value, location, SyntaxType.TOKEN) +# Values considered spaces +valid_spaces = [ + "\t", # U+0009 HORIZONTAL TAB + " ", # U+0020 SPACE +] + +# Values reserved for new line use +valid_newlines = [ + "\n", # U+000A LINE FEED + "\v", # U+000B VERTICAL TAB + "\f", # U+000C FORM FEED + "\r", # U+000D CARRIAGE RETURN + "\u0085", # U+0085 NEXT LINE + "\u2028", # U+2028 LINE SEPARATOR + "\u2029", # U+2029 PARAGRAPH SEPARATOR +] + + # Draws an unknown token @composite def draw_token_unknown(draw): - reserved = " \n\t" + reserved = valid_spaces + valid_newlines token = draw(draw_token_random()) chars = characters(blacklist_characters=reserved) value = draw(text(alphabet=chars, min_size=1)) @@ -99,9 +117,8 @@ # Draws a space token @composite def draw_token_space(draw): - space = " \t" token = draw(draw_token_random()) - value = draw(sampled_from(space)) + value = draw(sampled_from(valid_spaces)) return Syntax(value, token.location, SyntaxType.TOKEN) @@ -109,7 +126,7 @@ @composite def draw_token_newline(draw): token = draw(draw_token_random()) - value = "\n" + value = draw(sampled_from(valid_newlines)) return Syntax(value, token.location, SyntaxType.TOKEN) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 3089c87..f6275e9 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -18,6 +18,8 @@ draw_token_newline, draw_token_space, draw_token_unknown, + valid_spaces, + valid_newlines, ) @@ -55,11 +57,21 @@ # Test that the tokenizer can split tokens properly # We expect the following behaviour: -# - Whitespace is a tab, space or new line +# - Whitespace is any of the following Unicode sequences: +# U+0009 HORIZONTAL TAB +# U+000A 
LINE FEED +# U+000B VERTICAL TAB +# U+000C FORM FEED +# U+000D CARRIAGE RETURN +# U+000D U+000A CARRIAGE RETURN then LINE FEED +# U+0020 SPACE +# U+0085 NEXT LINE +# U+2028 LINE SEPARATOR +# U+2029 PARAGRAPH SEPARATOR # - Non-whitespace is anything else # - Whitespace and non-whitespace are separated in to tokens -# - Whitespace characters are split in to multiple tokens -# - Non-whitespace characters are combined in to a single token +# - Whitespace sequences are split in to multiple tokens +# - Non-whitespace code points are combined in to a single token # - Each token location is line 1 offset 1 of file "" @given(draw_tokens_to_split()) def test_tokenize_split_tokens(test_data): @@ -79,7 +91,7 @@ location = SyntaxLocation(line, offset, filename) new = Syntax(t.value, location, SyntaxType.TOKEN) located.append(new) - if t.value == "\n": + if t.value in valid_newlines: line = line + 1 offset = 1 else: @@ -89,13 +101,21 @@ # Test that the tokenizer can determine locations # We expect the following behaviour: +# - New line tokens are tokens with one of the following Unicode values: +# U+000A LINE FEED +# U+000B VERTICAL TAB +# U+000C FORM FEED +# U+000D CARRIAGE RETURN +# U+0085 NEXT LINE +# U+2028 LINE SEPARATOR +# U+2029 PARAGRAPH SEPARATOR # - Only the token location is modified # - Each token's location filename is the generated filename -# - A location's line is equal to 1 plus the number of tokens with the -# value '\n' before the token -# - A location's offset is equal to 1 plus the count of previous token's -# value code points 'before' it, where 'before' is defined as any -# token between the last '\n' (or start of file) before the token +# - A token's line is equal to 1 plus the number of new line tokens +# before the token +# - A token's offset is equal to 1 plus the sum of previous tokens' +# value lengths 'before' it, where 'before' is defined as any +# token between the last new line token (or start of file) before the token # and the token itself 
@given(draw_tokens_locations()) def test_tokenize_locations(test_data): @@ -109,7 +129,9 @@ input = draw(lists(draw_token_classified())) syntax = [] for s in input: - if s.type != SyntaxType.TOKEN or s.value not in ["\t", "\n", " "]: + if s.type != SyntaxType.TOKEN or ( + s.value not in valid_spaces and s.value not in valid_newlines + ): syntax.append(s) return (input, syntax) @@ -120,7 +142,13 @@ # - Tokens with the following following values are removed from the output: # U+0009 HORIZONTAL TAB # U+000A LINE FEED +# U+000B VERTICAL TAB +# U+000C FORM FEED +# U+000D CARRIAGE RETURN # U+0020 SPACE +# U+0085 NEXT LINE +# U+2028 LINE SEPARATOR +# U+2029 PARAGRAPH SEPARATOR @given(draw_tokens_whitespace()) def test_tokenize_strip_whitespace(test_data): (input, syntax) = test_data