-
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from deepsafe/init
init package
- Loading branch information
Showing
12 changed files
with
685 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Builds the source and wheel distributions and uploads them to PyPI with
# twine whenever a GitHub release is published or edited.
name: Publish Python Package

on:
  release:
    types:
      - published
      - edited

jobs:
  publish:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.x"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel twine

      # The upload authenticates with the API token stored in the
      # PYPI_API_TOKEN repository secret.
      - name: Build and publish
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          python setup.py sdist bdist_wheel
          twine upload --verbose --skip-existing dist/*
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Include the README | ||
include *.md | ||
|
||
# Include the license file | ||
include LICENSE | ||
|
||
# Include setup.py | ||
include setup.py | ||
|
||
# Include requirements.txt | ||
include requirements.txt | ||
|
||
# Include the data files | ||
recursive-include safetext/languages * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,24 @@ | ||
# safetext | ||
Rule-based profanity checking tool for English and Turkish. | ||
|
||
### installation | ||
|
||
```bash | ||
pip install safetext | ||
``` | ||
|
||
### usage | ||
|
||
```python | ||
from safetext import SafeText | ||
|
||
st = SafeText(language='en') | ||
|
||
results = st.check_profanity(text='Some text with <profanity-word>.') | ||
>> results | ||
>> [{'word': '<profanity-word>', 'index': 4, 'start': 15, 'end': 31}] | ||
|
||
text = st.censor_profanity(text='Some text with <profanity-word>.') | ||
>> text | ||
>> "Some text with ***." | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
zemberek-python>=0.2.3, <0.3.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from .languages.en import EnglishProfanityChecker | ||
from .languages.tr import TurkishProfanityChecker | ||
|
||
__version__ = "0.0.1" | ||
|
||
|
||
class SafeText:
    """Check and censor profanity in text for a single selected language.

    The actual work is delegated to a language-specific checker object that
    is chosen by :meth:`set_language`. Supported language codes are ``"en"``
    and ``"tr"``.
    """

    def __init__(self, language="en"):
        """Create an instance bound to ``language``.

        Args:
            language: Language code, ``"en"`` (default) or ``"tr"``.

        Raises:
            ValueError: If ``language`` is not supported.
        """
        self.language = language
        self.checker = None
        self.set_language(language)

    def set_language(self, language):
        """Switch the active language and its checker.

        Args:
            language: Language code, ``"en"`` or ``"tr"``.

        Raises:
            ValueError: If ``language`` is not supported. In that case the
                previously selected language and checker are left untouched.
        """
        if language == "en":
            checker = EnglishProfanityChecker()
        elif language == "tr":
            checker = TurkishProfanityChecker()
        else:
            raise ValueError("Language not supported")

        # Commit both attributes only after the checker was built, so a
        # rejected language can never leave ``language`` and ``checker``
        # describing different languages.
        self.language = language
        self.checker = checker

    def _require_checker(self):
        """Return the active checker, raising ValueError if none is set."""
        if self.checker is None:
            raise ValueError("Language not set")
        return self.checker

    def check_profanity(self, text):
        """Return the active checker's ``check`` result for ``text``.

        Raises:
            ValueError: If no checker is set.
        """
        return self._require_checker().check(text)

    def censor_profanity(self, text):
        """Return the active checker's ``censor`` result for ``text``.

        Raises:
            ValueError: If no checker is set.
        """
        return self._require_checker().censor(text)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
class BaseProfanityChecker:
    """Base class for word-list based profanity checkers.

    A checker is bound to a language code. Its word list is read lazily, on
    first use, from ``<directory of this module>/<language>/words.txt`` with
    one entry per line. Subclasses may override :meth:`_preprocess` to
    transform the text before it is matched.
    """

    def __init__(self, language):
        """Bind the checker to ``language`` (used to locate the word list)."""
        self.language = language

    @property
    def words_filepath(self):
        """Get the filepath for the profanity words file."""
        # Imported locally because this module has no top-level import block.
        import pathlib

        return f"{pathlib.Path(__file__).parent.resolve()}/{self.language}/words.txt"

    @property
    def profanity_words(self):
        """Get the profanity words for the language.

        The file is read once and the result is cached on the instance.
        """
        if not hasattr(self, "_profanity_words"):
            self._profanity_words = self._read_words(self.words_filepath)

        return self._profanity_words

    def _check(self, text):
        """Find profanity words in ``text``.

        The text is split into whitespace-delimited tokens. A token matches
        when its lower-cased form is in the word list, either as-is or after
        removing leading/trailing ASCII punctuation, so that a word followed
        by punctuation (e.g. at the end of a sentence) is still detected.

        NOTE: word-list entries that contain whitespace can never match,
        because tokens never contain whitespace.

        Returns:
            A list of dicts, one per match, in order of appearance, with keys
            ``"word"`` (the matched text as written), ``"index"`` (1-based
            position of the token), ``"start"`` and ``"end"`` (character
            offsets into ``text``, end-exclusive).
        """
        # Imported locally because this module has no top-level import block.
        import re
        import string

        profanity_words = self.profanity_words
        punctuation = string.punctuation
        profanity_infos = []

        # Offsets are taken from the real match positions, so runs of spaces,
        # tabs, newlines and leading whitespace are all accounted for.
        for i, match in enumerate(re.finditer(r"\S+", text)):
            word = match.group()
            start_index = match.start()

            if word.lower() not in profanity_words:
                without_prefix = word.lstrip(punctuation)
                core = without_prefix.rstrip(punctuation)
                if not core or core == word or core.lower() not in profanity_words:
                    continue
                # Shift the span so it covers only the word itself.
                start_index += len(word) - len(without_prefix)
                word = core

            profanity_infos.append(
                {
                    "word": word,
                    "index": i + 1,
                    "start": start_index,
                    "end": start_index + len(word),
                }
            )

        return profanity_infos

    def _read_words(self, filepath):
        """Read the profanity words from the given file.

        The file is decoded as UTF-8 regardless of the platform default, and
        surrounding whitespace and blank lines are dropped.
        NOTE(review): assumes the bundled word lists are UTF-8 encoded.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            lines = f.read().splitlines()

        return [line.strip() for line in lines if line.strip()]

    def _preprocess(self, text):
        """Preprocess the text before checking for profanity.

        The base implementation returns the text unchanged.
        """
        return text

    def check(self, text):
        """Check the text for profanity.

        Returns the list of match dicts produced by :meth:`_check` on the
        preprocessed text.
        """
        return self._check(self._preprocess(text))

    def censor(self, text):
        """Return ``text`` with every detected profanity replaced by ``***``.

        Only the detected spans are replaced; other occurrences of the same
        characters elsewhere in the text are left alone.
        """
        # NOTE(review): offsets refer to the preprocessed text; this assumes
        # ``_preprocess`` keeps character positions (true for the identity
        # default in this class).
        detected_profanities = sorted(self.check(text), key=lambda p: p["start"])

        # Rebuild the string from the untouched gaps between matches instead
        # of editing in place, so earlier replacements cannot shift the
        # offsets of later ones.
        pieces = []
        cursor = 0
        for profanity in detected_profanities:
            pieces.append(text[cursor:profanity["start"]])
            pieces.append("***")
            cursor = profanity["end"]
        pieces.append(text[cursor:])

        return "".join(pieces)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from safetext.languages.base import BaseProfanityChecker | ||
|
||
|
||
class EnglishProfanityChecker(BaseProfanityChecker):
    """English profanity checker."""

    def __init__(self):
        """Bind the checker to the English language code ``"en"``."""
        super().__init__(language="en")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
anal | ||
anus | ||
arse | ||
ass | ||
ass fuck | ||
ass hole | ||
assfucker | ||
asshole | ||
assshole | ||
bastard | ||
bitch | ||
black cock | ||
bloody hell | ||
boong | ||
cock | ||
cockfucker | ||
cocksuck | ||
cocksucker | ||
coon | ||
coonnass | ||
crap | ||
cunt | ||
cyberfuck | ||
damn | ||
darn | ||
dick | ||
dirty | ||
douche | ||
dummy | ||
erect | ||
erection | ||
erotic | ||
escort | ||
fag | ||
faggot | ||
fuck | ||
Fuck off | ||
fuck you | ||
fuckass | ||
fuckhole | ||
god damn | ||
gook | ||
hard core | ||
hardcore | ||
homoerotic | ||
hore | ||
lesbian | ||
lesbians | ||
mother fucker | ||
motherfuck | ||
motherfucker | ||
negro | ||
nigger | ||
orgasim | ||
orgasm | ||
penis | ||
penisfucker | ||
piss | ||
piss off | ||
porn | ||
porno | ||
pornography | ||
pussy | ||
retard | ||
sadist | ||
sex | ||
sexy | ||
shit | ||
slut | ||
son of a bitch | ||
suck | ||
tits | ||
viagra | ||
whore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from safetext.languages.base import BaseProfanityChecker | ||
|
||
|
||
class TurkishProfanityChecker(BaseProfanityChecker):
    """Profanity checker bound to the Turkish language code ``"tr"``."""

    def __init__(self):
        """Initialise the base checker with the ``"tr"`` language code."""
        super().__init__("tr")
Oops, something went wrong.