From 335ab648205b476684dd28eaf22baacebeb1c5ab Mon Sep 17 00:00:00 2001
From: Johannes Filter
Date: Fri, 16 Oct 2020 00:18:06 +0200
Subject: [PATCH] improve phone regex (fix #10)

---
Review note (ignored by `git am`): in the second alternative of PHONE_REGEX
the separator class is written `[ ./-]` with the hyphen last, so that it is a
literal `-`. Written as `[ .-/]`, the `.-/` part is parsed as a character
range covering only `.` and `/`, so hyphen-separated numbers such as
`0160-123456789` would not be matched by that alternative, while the
`[ .-]` classes in the first alternative do accept a hyphen.

 cleantext/constants.py |  3 ++-
 tests/test_clean.py    | 25 ++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/cleantext/constants.py b/cleantext/constants.py
index 0350849..6a12cb4 100644
--- a/cleantext/constants.py
+++ b/cleantext/constants.py
@@ -42,8 +42,9 @@
     flags=re.IGNORECASE | re.UNICODE,
 )
 
+# for more information: https://github.com/jfilter/clean-text/issues/10
 PHONE_REGEX = re.compile(
-    r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))"
+    r"((?:^|(?<=[^\w)]))(((\+?[01])|(\+\d{2}))[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ ./-]\d{6,9}"
 )
 
 NUMBERS_REGEX = re.compile(
diff --git a/tests/test_clean.py b/tests/test_clean.py
index f890146..d6b816e 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -36,10 +36,29 @@ def test_replace_emails():
     assert cleantext.replace_emails(text, "*EMAIL*") == proc_text
 
 
+phone_numbers = [
+    "+49 123 1548690",
+    "555-123-4567",
+    "2404 9099130",
+    "024049099130",
+    "02404 9099130",
+    "02404/9099130",
+    "+492404 9099130",
+    "+4924049099130",
+    "+492404/9099130",
+    "0160 123456789",
+    "0160/123456789",
+    "+32160 123456789",
+    "Tel.: 0160 123456789",
+]
+
+
 def test_replace_phone_numbers():
-    text = "I can be reached at 555-123-4567 through next Friday."
-    proc_text = "I can be reached at *PHONE* through next Friday."
-    assert cleantext.replace_phone_numbers(text, "*PHONE*") == proc_text
+    for x in phone_numbers:
+        x_phone = cleantext.replace_phone_numbers(x, "*PHONE*")
+        assert "PHONE" in x_phone and not any(map(str.isdigit, x_phone)), (
+            x + " / " + x_phone
+        )
 
 
 def test_replace_numbers():