Skip to content

Commit

Permalink
11-sep-update-qaznltk
Browse files Browse the repository at this point in the history
  • Loading branch information
silvermete0r committed Sep 11, 2024
1 parent 4bb7d9e commit 7021cf5
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 23 deletions.
19 changes: 13 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ similarity_score = qn.calc_similarity(textA, textB)
print(similarity_score)

# Input: textA = "Еңбегіне қарай — құрмет, Жасына қарай — ізет.", textB = "Еңбегіне қарай табысы, Ерлігіне қарай дабысы."
# Output: 0.2222222222222222
# Output: 0.368421052631579
```

4) Convert Kazakh language Text from Cyrillic to Latin using ISO-9 Standard:
Expand Down Expand Up @@ -101,7 +101,7 @@ sentimize_score = qnltk.sentimize(text)
print(sentimize_score)

# Input: Бұл мақала өте нашар жазылған.
# Output: -1 (negative)
# Output: -1.0 (negative)
```

7) Converting any number `N` into kazakh language number words [`N <= 10^31`]:
Expand All @@ -116,6 +116,17 @@ print(qnltk.num2word(n))
# Output: бір мың төрт жүз алпыс бес
```

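8) Extract information (date of birth, century of birth, gender, sequence number, control digit) from a Kazakhstan IIN (Individual Identification Number):
``` Python
from qaznltk import qaznltk as qnltk
qn = qnltk.QazNLTK()

iin = input("Enter IIN: ")
print(qn.get_info_from_iin(iin))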

# Input: 990408482390
# Output: {'status': 'success', 'date_of_birth': '08.04.1999', 'century_of_birth': '20', 'gender': 'female', 'sequence_number': 8239, 'control_discharge': 0}
```

* **Test Samples:** https://vk.com/club121755042

## Where to get it
Expand All @@ -130,10 +141,6 @@ pip install qaznltk
![image](https://github.com/silvermete0r/QazNLTK/assets/108217670/b1e8eaa1-f25f-4019-9d75-dee8d25d6a28)


The list of changes to pandas between each release can be found
[here](https://pandas.pydata.org/pandas-docs/stable/whatsnew/index.html). For full
details, see the commit logs at https://github.com/pandas-dev/pandas.

## Dependencies
- Package was developed on built-in python functions;

Expand Down
7 changes: 7 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from qaznltk import qaznltk as qnltk

# Build the text processor once for this script run.
qn = qnltk.QazNLTK()

# Read a single line of Kazakh Cyrillic text from standard input.
text = input()

# NOTE(review): the same commit renames `convert2latin` to `convert2latin_iso9`
# in qaznltk.py; confirm that `convert2latin` is still defined on QazNLTK.
latin_text = qn.convert2latin(text)
print(latin_text)
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ build-backend = "setuptools.build_meta"

[project]
name = "qaznltk"
version = "1.1.5"
version = "1.2.2"
authors = [
{ name="Aibol Zhussip, Kadyr Saparbay & Arman Zhalgasbayev", email="supwithproject@gmail.com" },
{ name="Aibol Zhussip, Kadyr Saparbay, Tair Ussen & Arman Zhalgasbayev", email="supwithproject@gmail.com" },
]
description = "A package for working with Kazakh language text processing."
readme = "README.md"
Expand Down
116 changes: 101 additions & 15 deletions src/qaznltk/qaznltk.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from typing import List
from collections import Counter
from functools import lru_cache

class QazNLTK:
'''
Expand Down Expand Up @@ -62,10 +63,43 @@ class QazNLTK:
positive_words = set()
negative_words = set()

@lru_cache(maxsize=None)
def __init__(self, stop_words_file="special_words/stop_words.txt", positive_words_file="special_words/positive_words.txt", negative_words_file="special_words/negative_words.txt") -> None:
QazNLTK.stop_words = set(self.load_words(stop_words_file))
QazNLTK.positive_words = set(self.load_words(positive_words_file))
QazNLTK.negative_words = set(self.load_words(negative_words_file))

@staticmethod
def __preprocess_text(text: str) -> str:
return re.sub(r'\W+', '', text.lower())

@staticmethod
def __jaccard_similarity(str1: str, str2: str) -> float:
# Calculate Jaccard similarity: https://en.wikipedia.org/wiki/Jaccard_index
set1 = set(str1)
set2 = set(str2)
intersection = len(set1.intersection(set2))
union = len(set1.union(set2))
return intersection / union if union != 0 else 0

@staticmethod
@lru_cache(maxsize=None)
def __levenshtein_distance(s1: str, s2: str) -> int:
# Calculate Levenshtein distance: https://en.wikipedia.org/wiki/Levenshtein_distance
if len(s1) < len(s2):
return QazNLTK.__levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]

@staticmethod
def load_words(words_file: str) -> List[str]:
Expand All @@ -80,7 +114,7 @@ def load_words(words_file: str) -> List[str]:
return []

@classmethod
def convert2cyrillic(cls, text: str) -> str:
def convert2cyrillic_iso9(cls, text: str) -> str:
# Convert kazakh latin to cyrillic
cyrillic_text = ''
reversed_mapping = {v: k for k, v in cls.cyrillic_to_iso_9_mapping.items()}
Expand All @@ -91,7 +125,7 @@ def convert2cyrillic(cls, text: str) -> str:
return cyrillic_text

@classmethod
def convert2latin(cls, text: str) -> str:
def convert2latin_iso9(cls, text: str) -> str:
# Convert kazakh cyrillic to latin
latin_text = ''

Expand Down Expand Up @@ -121,22 +155,70 @@ def sent_tokenize(cls, text: str) -> List[str]:
sentences = [sentence.strip() for sentence in sentences]

return sentences

@staticmethod
def calc_similarity(textA: str, textB: str) -> float:
# Jaccard Similarity Calculation
setA = set(textA.split())
setB = set(textB.split())
# Calculate similarity between two texts
str1 = QazNLTK.__preprocess_text(textA)
str2 = QazNLTK.__preprocess_text(textB)

if not str1 or not str2:
return 0

jaccard_similarity = QazNLTK.__jaccard_similarity(str1, str2)
levenshtein_distance = QazNLTK.__levenshtein_distance(str1, str2)

intersection = len(setA.intersection(setB))
union = len(setA) + len(setB) - intersection
similarity_score = (jaccard_similarity + 1 / (levenshtein_distance + 1)) / 2

return similarity_score

similarity_score = intersection / union if union != 0 else 0.0
@staticmethod
def get_info_from_iin(iin: str) -> dict:
# Get information from IIN
if len(iin) != 12 or not iin.isdigit():
return {"status": "error", "message": "Incorrect IIN. The length of the IIN must be 12 digits."}

return similarity_score
year = int(iin[:2])
month = int(iin[2:4])
day = int(iin[4:6])
sequence_number = int(iin[7:11])
control_discharge = int(iin[11:12])

gender_code = int(iin[6])

if gender_code in [1, 2]:
year += 1800
elif gender_code in [3, 4]:
year += 1900
elif gender_code in [5, 6]:
year += 2000
else:
return {"status": "error", "message": "Incorrect IIN. The first digit of the IIN must be in the range [1, 6]."}

if gender_code % 2 == 1:
gender = "male"
else:
gender = "female"

try:
from datetime import datetime
birth_date = datetime(year, month, day)
except ValueError:
return {"status": "error", "message": "Incorrect IIN. The date of birth is incorrect."}

info = {
"status": "success",
"date_of_birth": birth_date.strftime("%d.%m.%Y"),
"century_of_birth": f"{year // 100 + 1}",
"gender": gender,
"sequence_number": sequence_number,
"control_discharge": control_discharge
}

return info

@classmethod
def sentimize(cls, tokens) -> int:
def sentimize(cls, tokens) -> float:
# Sentiment analysis by tokens
if type(tokens) == str:
tokens = cls.tokenize(tokens)
Expand All @@ -151,11 +233,11 @@ def sentimize(cls, tokens) -> int:
negative_score += freq

if positive_score > negative_score:
return 1
return 1.0
elif positive_score < negative_score:
return -1
return -1.0
else:
return 0
return 0.0

@staticmethod
def num2word(n):
Expand All @@ -181,6 +263,7 @@ def num2word(n):
"90": "тоқсан"
}
mp = {
0: "нөл",
1: "он",
2: "жүз",
3: "мың",
Expand Down Expand Up @@ -264,4 +347,7 @@ def num2word(n):
# print(sentimize_score)

# n = int(input())
# print(qnltk.num2word(n))
# print(qnltk.num2word(n))

iin = input("Enter IIN: ")
print(qnltk.get_info_from_iin(iin))

0 comments on commit 7021cf5

Please sign in to comment.