diff --git a/src/tokenize.py b/src/tokenize.py
new file mode 100644
index 0000000..8c97b08
--- /dev/null
+++ b/src/tokenize.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: LGPL-2.1-only
+# Copyright 2022 Jookia
+
+from itertools import groupby
+
+
+# Checks whether a symbol is exactly one space, tab or newline character
+def is_whitespace(symbol):
+    return symbol in (" ", "\t", "\n")
+
+
+# Splits text into a list of alternating whitespace and non-whitespace runs
+def split_symbols(input):
+    # groupby starts a new group each time is_whitespace changes, so every
+    # group is one maximal run; empty input produces no groups at all
+    return ["".join(run) for _, run in groupby(input, key=is_whitespace)]
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
new file mode 100644
index 0000000..847e362
--- /dev/null
+++ b/tests/test_tokenize.py
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: LGPL-2.1-only
+# Copyright 2022 Jookia
+
+from hypothesis import given
+from hypothesis.strategies import (
+    booleans,
+    characters,
+    composite,
+    just,
+    lists,
+    sampled_from,
+    text,
+)
+
+from src import tokenize
+
+# Whitespace that separates lexer words
+whitespace = " \n\t"
+
+
+# Draws tokenizer symbol
+@composite
+def draw_symbol(draw):
+    return draw(text(alphabet=characters(blacklist_characters=whitespace), min_size=1))
+
+
+# Draws tokenizer whitespace
+@composite
+def draw_whitespace(draw):
+    return "".join(draw(lists(sampled_from(whitespace), min_size=1)))
+
+
+# Generates an alternating list of symbols and whitespace
+@composite
+def draw_split_list(draw):
+    output = []
+    elements = draw(lists(just(True)))
+    drawing_whitespace = draw(booleans())
+    for _ in elements:
+        if drawing_whitespace:
+            strategy = draw_whitespace()
+        else:
+            strategy = draw_symbol()
+        output.append(draw(strategy))
+        drawing_whitespace = not drawing_whitespace
+    return output
+
+
+# Test that the tokenizer can split symbols properly
+@given(draw_split_list())
+def test_tokenizer_split_symbols(split):
+    input = "".join(split)
+    assert tokenize.split_symbols(input) == split