Skip to content

Commit

Permalink
Merge pull request #637 from dianna-ai/633-tokenizer-tests
Browse files: browse the repository at this point in the history
added tests for the 'tokenizer problem'
  • Loading branch information
WillemSpek authored Sep 14, 2023
2 parents 3284afe + cd3d058 commit 7ed1f9c
Showing 1 changed file with 14 additions and 0 deletions.
14 changes: 14 additions & 0 deletions tests/test_lime.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -165,7 +165,21 @@ def tokenizer():
('UNKWORDZ a bad UNKWORDZ UNKWORDZ!?\'"', 9),
('such UNKWORDZ UNKWORDZ movie "UNKWORDZUNKWORDZ\'UNKWORDZ', 9),
('such a bad UNKWORDZ UNKWORDZ!UNKWORDZ\'UNKWORDZ', 9),
pytest.param('its own self-UNKWORDZ universe.', 7,
marks=pytest.mark.xfail(reason='poor handling of -')),
pytest.param('its own UNKWORDZ-contained universe.', 7,
marks=pytest.mark.xfail(reason='poor handling of -')),
pytest.param('Backslashes are UNKWORDZ/cool.', 6,
marks=pytest.mark.xfail(reason='/ poor handling of /')),
pytest.param('Backslashes are fun/UNKWORDZ.', 6,
marks=pytest.mark.xfail(reason='poor handling of /')),
pytest.param(' ', 0,
marks=pytest.mark.xfail(reason='Repeated whitespaces')),
pytest.param('I like whitespaces.', 4,
marks=pytest.mark.xfail(reason='Repeated whitespaces')),
])


def test_spacytokenizer_length(text, length, tokenizer):
"""Test that tokenizer returns strings of the correct length."""
tokens = tokenizer.tokenize(text)
Expand Down

0 comments on commit 7ed1f9c

Please sign in to comment.