diff --git a/docs/syntax.md b/docs/syntax.md
index b3fe6c8..1b8004f 100644
--- a/docs/syntax.md
+++ b/docs/syntax.md
@@ -93,12 +93,13 @@
 - U+0009 HORIZONTAL TAB
 - U+0020 SPACE
 
-New lines are the following code points:
+New lines are the following code point sequences:
 
 - U+000A LINE FEED
 - U+000B VERTICAL TAB
 - U+000C FORM FEED
 - U+000D CARRIAGE RETURN
+- U+000D U+000A CARRIAGE RETURN then LINE FEED
 - U+0085 NEXT LINE
 - U+2028 LINE SEPARATOR
 - U+2029 PARAGRAPH SEPARATOR
diff --git a/src/tokenize.py b/src/tokenize.py
index ca99da0..3534977 100644
--- a/src/tokenize.py
+++ b/src/tokenize.py
@@ -15,6 +15,7 @@
     "\v", # U+000B VERTICAL TAB
     "\f", # U+000C FORM FEED
     "\r", # U+000D CARRIAGE RETURN
+    "\r\n", # U+000D U+000A CARRIAGE RETURN then LINE FEED
     "\u0085", # U+0085 NEXT LINE
     "\u2028", # U+2028 LINE SEPARATOR
     "\u2029", # U+2029 PARAGRAPH SEPARATOR
@@ -31,24 +32,24 @@
     if input == "":
         return []
     tokens = []
-    current = input[0]
-    curr_whitespace = is_whitespace(input[0])
+    prev = input[0]
+    buffer = prev
     location = SyntaxLocation(1, 1, "")
 
-    for c in input[1:]:
-        c_whitespace = is_whitespace(c)
-        if c_whitespace != curr_whitespace:
-            # Flush current buffer and switch modes
-            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
-            current = c
-            curr_whitespace = c_whitespace
-        elif curr_whitespace:
-            # Whitespace mode appends each code point
-            tokens.append(Syntax(current, location, SyntaxType.TOKEN))
-            current = c
-        else:
-            # Token mode builds the current buffer
-            current += c
-    tokens.append(Syntax(current, location, SyntaxType.TOKEN))
+    for curr in input[1:]:
+        curr_space = is_whitespace(curr)
+        prev_space = is_whitespace(prev)
+        switching = curr_space != prev_space
+        crlf = prev == "\r" and curr == "\n"
+        # Flush if we switch between whitespace and non-whitespace code points
+        # Flush if we're working with a stream of whitespace
+        # Don't flush if we're in the middle of a CR LF sequence
+        flush = switching or (curr_space and not crlf)
+        if flush:
+            tokens.append(Syntax(buffer, location, SyntaxType.TOKEN))
+            buffer = ""
+        buffer += curr
+        prev = curr
+    tokens.append(Syntax(buffer, location, SyntaxType.TOKEN))
     return tokens
diff --git a/tests/test_syntax.py b/tests/test_syntax.py
index ef32e9a..1935c9a 100644
--- a/tests/test_syntax.py
+++ b/tests/test_syntax.py
@@ -90,8 +90,8 @@
     " ", # U+0020 SPACE
 ]
 
-# Values reserved for new line use
-valid_newlines = [
+# Single values reserved for new line use
+single_newlines = [
     "\n", # U+000A LINE FEED
     "\v", # U+000B VERTICAL TAB
     "\f", # U+000C FORM FEED
@@ -101,14 +101,24 @@
     "\u2029", # U+2029 PARAGRAPH SEPARATOR
 ]
 
+# Multi values reserved for new line use
+multi_newlines = [
+    "\r\n", # U+000D U+000A CARRIAGE RETURN then LINE FEED
+]
+
+# All values reserved for new line use
+valid_newlines = single_newlines + multi_newlines
+
 
 # Draws an unknown token
 @composite
 def draw_token_unknown(draw):
-    reserved = valid_spaces + valid_newlines
+    reserved = valid_spaces + single_newlines
     token = draw(draw_token_random())
     chars = characters(blacklist_characters=reserved)
     value = draw(text(alphabet=chars, min_size=1))
+    for v in multi_newlines:
+        assume(v not in value)
     assume(value not in ["True", "False"])
     assume(value not in keywords)
     return Syntax(value, token.location, SyntaxType.TOKEN)
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index f6275e9..f169fdf 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -31,6 +31,24 @@
     return Syntax(token.value, location, SyntaxType.TOKEN)
 
+
+# Merges \r and \n tokens to \r\n tokens
+def merge_crlf(tokens):
+    if len(tokens) < 2:
+        return tokens
+    prev = tokens[0]
+    merged = []
+    for curr in tokens[1:]:
+        if prev.value == "\r" and curr.value == "\n":
+            # Don't append the \r
+            # Set prev to \r\n instead of \n
+            prev = Syntax("\r\n", prev.location, SyntaxType.TOKEN)
+        else:
+            merged.append(prev)
+        prev = curr
+    merged.append(prev)
+    return merged
+
+
 # Generates an alternating sequence of unknown or whitespace tokens
 # intended for splitting in to separate tokens
 @composite
@@ -44,7 +62,8 @@
             # Multiple whitespaces get split in to multiple tokens
             strategy = one_of([draw_token_space(), draw_token_newline()])
             locationed = draw_token_splitted(strategy)
-            tokens += draw(lists(locationed, min_size=1))
+            spaces = draw(lists(locationed, min_size=1))
+            tokens += merge_crlf(spaces)
         else:
             strategy = draw_token_unknown
             locationed = draw_token_splitted(strategy())
@@ -101,11 +120,12 @@
 
 # Test that the tokenizer can determine locations
 # We expect the following behaviour:
-# - New line tokens are tokens with one of the following Unicode values:
+# - New line tokens are tokens with one of the following Unicode sequences:
 #   U+000A LINE FEED
 #   U+000B VERTICAL TAB
 #   U+000C FORM FEED
 #   U+000D CARRIAGE RETURN
+#   U+000D U+000A CARRIAGE RETURN then LINE FEED
 #   U+0085 NEXT LINE
 #   U+2028 LINE SEPARATOR
 #   U+2029 PARAGRAPH SEPARATOR
@@ -139,12 +159,13 @@
 # Test that the tokenizer can strip whitespace correctly
 # We expect the following behaviour:
 # - No syntax is modified
-# - Tokens with the following following values are removed from the output:
+# - Tokens with the following values are removed from the output:
 #   U+0009 HORIZONTAL TAB
 #   U+000A LINE FEED
 #   U+000B VERTICAL TAB
 #   U+000C FORM FEED
 #   U+000D CARRIAGE RETURN
+#   U+000D U+000A CARRIAGE RETURN then LINE FEED
 #   U+0020 SPACE
 #   U+0085 NEXT LINE
 #   U+2028 LINE SEPARATOR