# Builds the source distribution and wheel for the package and uploads them
# to PyPI. Runs whenever a GitHub release is published or edited.
name: Publish Python Package

on:
  release:
    types: [published, edited]

jobs:
  publish:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine
      - name: Build and publish
        env:
          # PyPI API-token authentication: the username is the literal
          # string "__token__" and the password is the token itself.
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          # Invoking `setup.py sdist bdist_wheel` directly is deprecated by
          # setuptools; `python -m build` produces the same two artifacts
          # in dist/ through the standard build front end.
          python -m build
          # --skip-existing keeps re-runs (e.g. on an edited release) from
          # failing when the version is already on PyPI.
          twine upload --verbose --skip-existing dist/*
class BaseProfanityChecker:
    """Base class for word-list based profanity checkers.

    A checker is bound to a language code. The word list is loaded lazily
    from ``<this package dir>/<language>/words.txt`` (one entry per line) and
    compared against the whitespace-separated tokens of the input text.
    """

    def __init__(self, language):
        """Create a checker for ``language`` (e.g. ``"en"`` or ``"tr"``)."""
        self.language = language

    @property
    def words_filepath(self):
        """Return the path (as ``str``) of the word list for this language."""
        import pathlib

        package_dir = pathlib.Path(__file__).parent.resolve()
        return str(package_dir / self.language / "words.txt")

    @property
    def profanity_words(self):
        """Return the list of profanity words, reading the file on first use."""
        if not hasattr(self, "_profanity_words"):
            self._profanity_words = self._read_words(self.words_filepath)

        return self._profanity_words

    def _check(self, text):
        """Find profanity tokens in ``text``.

        Tokens are maximal runs of non-whitespace characters. A token matches
        when its lower-cased form is in the word list; if it is not, the token
        is retried with leading/trailing ASCII punctuation removed, so that
        ``"word."`` or ``"(word)"`` are still detected while list entries that
        themselves contain punctuation keep matching exactly.

        Returns:
            A list of dicts, in order of appearance, with keys ``"word"``
            (the matched text as written), ``"index"`` (1-based token
            position), ``"start"`` and ``"end"`` (character offsets into
            ``text``, end exclusive).
        """
        import re
        import string

        # Build the lookup set once per call: O(1) membership per token
        # instead of scanning the whole list for every token.
        lookup = set(self.profanity_words)
        punctuation = string.punctuation

        profanity_infos = []
        # Using the regex match positions gives the real offsets even with
        # leading, repeated, or non-space whitespace between tokens.
        for position, match in enumerate(re.finditer(r"\S+", text), start=1):
            token = match.group()
            start_index = match.start()

            if token.lower() not in lookup:
                core = token.strip(punctuation)
                if not core or core.lower() not in lookup:
                    continue
                # Shift the start past the punctuation stripped on the left.
                start_index += len(token) - len(token.lstrip(punctuation))
                token = core

            profanity_infos.append(
                {
                    "word": token,
                    "index": position,
                    "start": start_index,
                    "end": start_index + len(token),
                }
            )

        return profanity_infos

    def _read_words(self, filepath):
        """Read the word list at ``filepath``.

        The file is decoded as UTF-8 explicitly so that non-ASCII entries are
        read correctly regardless of the platform's default encoding.
        Surrounding whitespace is stripped and blank lines are skipped.

        Returns:
            A list of words in file order.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]

    def _preprocess(self, text):
        """Hook for subclasses to normalise ``text``; identity by default."""
        return text

    def check(self, text):
        """Return the profanity matches found in ``text`` (see ``_check``)."""
        return self._check(self._preprocess(text))

    def censor(self, text):
        """Return ``text`` with every detected profanity replaced by ``***``.

        Only the exact character spans reported by ``check`` are replaced;
        other occurrences of the same letters (e.g. inside a longer, clean
        word) are left untouched.
        """
        # NOTE: offsets come from check(), i.e. from the preprocessed text.
        # They line up with ``text`` as long as _preprocess() keeps character
        # positions (the default implementation returns the text unchanged).
        pieces = []
        cursor = 0
        # Matches are reported left to right, so the output can be assembled
        # in one pass without earlier replacements shifting later offsets.
        for profanity in self.check(text):
            pieces.append(text[cursor:profanity["start"]])
            pieces.append("***")
            cursor = profanity["end"]
        pieces.append(text[cursor:])

        return "".join(pieces)
class TurkishProfanityChecker(BaseProfanityChecker):
    """Profanity checker for Turkish text.

    Configures the base checker with the ``"tr"`` language code; checking and
    censoring behaviour is inherited unchanged.
    """

    def __init__(self):
        language_code = "tr"
        super().__init__(language=language_code)
+atmık +attırdığım +azdım +azdır +azdırıcı +bitch +boner +bosalmak +boşalmak +cenabet +cibilliyetini +çük +daltassak +dalyarak +dalyarrak +dassagi +dildo +domal +domalan +domaldı +domaldın +domalık +domalıyor +domalmak +domalmış +domalsın +domalt +domaltarak +domaltıp +domaltır +domaltırım +domaltip +domaltmak +dölü +ebeninki +fahise +fahişe +gavad +gavat +giberim +giberler +gibis +gibiş +gibmek +gibtiler +goddamn +godoş +godumun +gotelek +gotlalesi +gotlu +gotten +gotundeki +gotunden +gotune +gotunu +gotveren +goyiim +goyum +goyuyim +goyyim +göt +götelek +götlalesi +götlek +götoğlanı +götoş +götten +götü +götün +götüne +götünekoyim +götünü +götveren +hasiktir +hassikome +hassiktir +hassittir +ıbnelık +ibine +ibinenin +ibne +ibnedir +ibneleri +ibnelik +ibnelri +ibneni +ibnenin +ibnerator +ibnesi +ipne +itoğlu +kaltak +kancık +kancik +karhane +kaşar +kavat +kavatn +kerane +kerhane +kerhanelerde +kevase +kevaşe +koduğmun +koduğmunun +kodumun +kodumunun +koduumun +madafaka +malafat +meme +memelerini +memesini +memeleri +memeli +mincikliyim +mıncıklıyım +mıncıklayayım +mıncıkladım +mıncıklandım +mıncıkladı +mıncıkla +motherfucker +oğlancı +orosbucocuu +orospu +orospucocugu +orospuçocuğu +orospudur +orospular +orospunun +orospuydu +orospuyuz +orostoban +orostopol +orrospu +oruspu +oruspuçocuğu +osbir +otuzbir +penis +pezevek +pezeveng +pezevengi +pezevengin +pezevenk +pezo +picler +piçin +piç +piçler +pipi +pipiş +pisliktir +porno +pussy +puşt +puşttur +s1kerim +s1kerm +s1krm +sakso +saxo +serefsiz +sevişelim +sıçarım +sıçayım +sıçmış +sıçmışsın +sıçtığım +sicarsin +sik +sikdi +sikdiğim +sike +sikecem +sikem +siken +sikenin +siker +sikerim +sikerler +sikersin +sikertir +sikertmek +sikesicenin +sikey +sikeydim +sikeyim +sikeym +siki +sikicem +sikici +sikien +sikienler +sikiiim +sikiiimmm +sikiim +sikiir +sikiirken +sikik +sikil +sikildiini +sikilesice +sikilmi +sikilmie +sikilmis +sikilmiş +sikilsin +sikim +sikimde +sikimden +sikime +sikimi +sikimiin +sikimin +sikimle 
import io
import os
import re

import setuptools

# Directory containing this setup.py; every file read below is resolved
# against it so the build works regardless of the current working directory.
_BASE_DIR = os.path.abspath(os.path.dirname(__file__))


def get_long_description():
    """Return the contents of README.md, used as the PyPI long description."""
    with io.open(os.path.join(_BASE_DIR, "README.md"), encoding="utf-8") as f:
        return f.read()


def get_requirements():
    """Return the install requirements listed in requirements.txt.

    Blank lines and ``#`` comment lines are skipped.
    """
    requirements_file = os.path.join(_BASE_DIR, "requirements.txt")
    with io.open(requirements_file, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line and not line.startswith("#")]


def get_version():
    """Return the ``__version__`` string declared in safetext/__init__.py.

    The file is parsed as text rather than imported, so the package's runtime
    dependencies do not need to be installed at build time.

    Raises:
        RuntimeError: if no ``__version__ = "..."`` assignment is found.
    """
    version_file = os.path.join(_BASE_DIR, "safetext", "__init__.py")
    with io.open(version_file, encoding="utf-8") as f:
        match = re.search(r'^__version__ = [\'"]([^\'"]*)[\'"]', f.read(), re.M)
    if match is None:
        raise RuntimeError("Unable to find __version__ in {}".format(version_file))
    return match.group(1)


_DEV_REQUIREMENTS = [
    "black==21.7b0",
    "flake8==3.9.2",
    "isort==5.9.2",
    "click==8.0.4",
    "importlib-metadata>=1.1.0,<4.3;python_version<'3.8'",
]

extras = {"tests": _DEV_REQUIREMENTS, "dev": _DEV_REQUIREMENTS}


# No console-script entry points are declared: the package ships no CLI
# module, and an entry point to a missing module installs a broken command.
setuptools.setup(
    name="safetext",
    version=get_version(),
    author="",
    license="MIT",
    description="Rule-based profanity checking tool for English and Turkish.",
    long_description=get_long_description(),
    long_description_content_type="text/markdown",
    url="https://github.com/deepsafe/safetext",
    packages=setuptools.find_packages(exclude=["tests"]),
    python_requires=">=3.7",
    install_requires=get_requirements(),
    extras_require=extras,
    # Ships the words.txt data files selected by MANIFEST.in.
    include_package_data=True,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Operating System :: OS Independent",
        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "Programming Language :: Python :: 3",
        "Topic :: Software Development :: Libraries",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Topic :: Education",
        "Topic :: Scientific/Engineering",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    keywords="text, profanity, filtering, turkish, english",
)