diff --git a/src/tokenize.py b/src/tokenize.py
index c76378f..c9a0210 100644
--- a/src/tokenize.py
+++ b/src/tokenize.py
@@ -4,9 +4,9 @@
 import enum
 
 
-# Checks whether a symbol is whitespace
-def is_whitespace(symbol):
-    return symbol == " " or symbol == "\t" or symbol == "\n"
+# Checks whether a symbol is space
+def is_space(symbol):
+    return symbol == " " or symbol == "\t"
 
 
 # Location of a token
@@ -34,7 +34,8 @@
 # The type of a token
 class TokenType(enum.Enum):
     UNKNOWN = enum.auto()  # pragma: no mutate
-    WHITESPACE = enum.auto()  # pragma: no mutate
+    SPACE = enum.auto()  # pragma: no mutate
+    NEWLINE = enum.auto()  # pragma: no mutate
 
 
 # Represents a tokenizer token
@@ -59,17 +60,17 @@
     )
 
 
-# Splits text in to a list of characters and whitespace
+# Splits text into a list of characters and space
 def split_tokens(input):
     if input == "":
         return []
     tokens = []
     current = input[0]
-    curr_whitespace = is_whitespace(input[0])
+    curr_whitespace = is_space(input[0]) or input[0] == "\n"
     location = TokenLocation(1, 1, "")
     type = TokenType.UNKNOWN
     for c in input[1:]:
-        c_whitespace = is_whitespace(c)
+        c_whitespace = is_space(c) or c == "\n"
         if c_whitespace != curr_whitespace:
             # Flush current buffer and switch modes
             tokens.append(Token(current, location, type))
@@ -107,8 +108,10 @@
 def classify_tokens(tokens):
     new_tokens = []
     for t in tokens:
-        if is_whitespace(t.value):
-            type = TokenType.WHITESPACE
+        if t.value == "\n":
+            type = TokenType.NEWLINE
+        elif is_space(t.value):
+            type = TokenType.SPACE
         else:
             type = TokenType.UNKNOWN
         new = Token(t.value, t.location, type)
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 83d7b25..96ad86c 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -16,9 +16,6 @@
 from src import tokenize
 
 
-# Whitespace that separates lexer words
-whitespace = " \n\t"
-
 
 # Draws a random token location
 @composite
@@ -44,22 +41,33 @@
     return tokenize.Token(value, location, type)
 
 
-# Draws a non-whitespace token
+# Draws an unknown token
 @composite
-def draw_token_nonwhitespace(draw):
+def draw_token_unknown(draw):
+    reserved = " \n\t"
     token = draw(draw_token_random())
-    chars = characters(blacklist_characters=whitespace)
+    chars = characters(blacklist_characters=reserved)
     value = draw(text(alphabet=chars, min_size=1))
     type = tokenize.TokenType.UNKNOWN
     return tokenize.Token(value, token.location, type)
 
 
-# Draws a whitespace token
+# Draws a space token
 @composite
-def draw_token_whitespace(draw):
+def draw_token_space(draw):
+    space = " \t"
     token = draw(draw_token_random())
-    value = draw(sampled_from(whitespace))
-    type = tokenize.TokenType.WHITESPACE
+    value = draw(sampled_from(space))
+    type = tokenize.TokenType.SPACE
+    return tokenize.Token(value, token.location, type)
+
+
+# Draws a new line token
+@composite
+def draw_token_newline(draw):
+    token = draw(draw_token_random())
+    value = "\n"
+    type = tokenize.TokenType.NEWLINE
     return tokenize.Token(value, token.location, type)
 
 
@@ -106,13 +114,13 @@
 # Draws a token with the values split_tokens outputs
 @composite
 def draw_token_splitted(draw, strategy):
-    token = draw(strategy())
+    token = draw(strategy)
     location = tokenize.TokenLocation(1, 1, "")
     type = tokenize.TokenType.UNKNOWN
     return tokenize.Token(token.value, location, type)
 
 
-# Generates an alternating sequence of whitespace and non-whitespace
+# Generates an alternating sequence of unknown or whitespace tokens
 @composite
 def draw_tokens_list(draw):
     output = []
@@ -120,12 +128,12 @@
     drawing_whitespace = draw(booleans())
     for _ in elements:
         if drawing_whitespace:
-            strategy = draw_token_whitespace
+            strategy = one_of([draw_token_space(), draw_token_newline()])
             locationed = draw_token_splitted(strategy)
             output += draw(lists(locationed, min_size=1))
         else:
-            strategy = draw_token_nonwhitespace
-            locationed = draw_token_splitted(strategy)
+            strategy = draw_token_unknown
+            locationed = draw_token_splitted(strategy())
             output.append(draw(locationed))
         drawing_whitespace = not drawing_whitespace
     return output
@@ -176,8 +184,9 @@
 @composite
 def draw_tokens_classified(draw):
     strategies = [
-        draw_token_nonwhitespace(),
-        draw_token_whitespace(),
+        draw_token_unknown(),
+        draw_token_space(),
+        draw_token_newline(),
     ]
     tokens = draw(lists(one_of(strategies)))
     input = []
@@ -198,8 +207,9 @@
 @composite
 def draw_source_fuzz(draw):
     strategies = [
-        draw_token_nonwhitespace(),
-        draw_token_whitespace(),
+        draw_token_unknown(),
+        draw_token_space(),
+        draw_token_newline(),
     ]
     tokens = draw(lists(one_of(strategies)))
     input = ""
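
A minimal sketch of the intended behaviour, assuming the code outside the
visible hunks works as the tests suggest (the trailing buffer flush in
split_tokens, the final return in classify_tokens, and Token exposing
value/location/type as attributes, as token.location in the tests implies):

    from src import tokenize

    # A lone "\n" is still split out as its own token, but now classifies
    # as NEWLINE instead of the old catch-all WHITESPACE
    tokens = tokenize.classify_tokens(tokenize.split_tokens("a\nb"))
    assert [t.type for t in tokens] == [
        tokenize.TokenType.UNKNOWN,
        tokenize.TokenType.NEWLINE,
        tokenize.TokenType.UNKNOWN,
    ]

    # A blank or a tab classifies as SPACE
    tokens = tokenize.classify_tokens(tokenize.split_tokens("a b"))
    assert tokens[1].type == tokenize.TokenType.SPACE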