From 7021cf5971eb5230b6b4057616230608d0a7f126 Mon Sep 17 00:00:00 2001 From: silvermete0r Date: Wed, 11 Sep 2024 10:08:57 +0500 Subject: [PATCH] 11-sep-update-qaznltk --- README.md | 19 ++++--- app.py | 7 +++ pyproject.toml | 4 +- src/qaznltk/qaznltk.py | 116 +++++++++++++++++++++++++++++++++++------ 4 files changed, 123 insertions(+), 23 deletions(-) create mode 100644 app.py diff --git a/README.md b/README.md index c349a4d..f2e6d50 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ similarity_score = qn.calc_similarity(textA, textB) print(similarity_score) # Input: textA = "Еңбегіне қарай — құрмет, Жасына қарай — ізет.", textB = "Еңбегіне қарай табысы, Ерлігіне қарай дабысы." -# Output: 0.2222222222222222 +# Output: 0.368421052631579 ``` 4) Convert Kazakh language Text from Cyrillic to Latin using ISO-9 Standard: @@ -101,7 +101,7 @@ sentimize_score = qnltk.sentimize(text) print(sentimize_score) # Input: Бұл мақала өте нашар жазылған. -# Output: -1 (negative) +# Output: -1.0 (negative) ``` 7) Converting any number `N` into kazakh language number words [`N <= 10^31`]: @@ -116,6 +116,17 @@ print(qnltk.num2word(n)) # Output: бір мың төрт жүз алпыс бес ``` +``` Python +from qaznltk import qaznltk as qnltk +qn = qnltk.QazNLTK() + +iin = input("Enter IIN: ") +print(qn.get_info_from_iin(iin)) + +# Input: 990408482390 +# Output: {'status': 'success', 'date_of_birth': '08.04.1999', 'century_of_birth': '20', 'gender': 'female', 'sequence_number': 8239, 'control_discharge': 0} +``` + * **Test Samples:** https://vk.com/club121755042 ## Where to get it @@ -130,10 +141,6 @@ pip install qaznltk ![image](https://github.com/silvermete0r/QazNLTK/assets/108217670/b1e8eaa1-f25f-4019-9d75-dee8d25d6a28) -The list of changes to pandas between each release can be found -[here](https://pandas.pydata.org/pandas-docs/stable/whatsnew/index.html). For full -details, see the commit logs at https://github.com/pandas-dev/pandas. 
- ## Dependencies - Package was developed on built-in python functions; diff --git a/app.py b/app.py new file mode 100644 index 0000000..9294f6c --- /dev/null +++ b/app.py @@ -0,0 +1,7 @@ +from qaznltk import qaznltk as qnltk + +qn = qnltk.QazNLTK() + +text = input() + +print(qn.convert2latin(text)) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d1a1f7b..81516cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,9 +4,9 @@ build-backend = "setuptools.build_meta" [project] name = "qaznltk" -version = "1.1.5" +version = "1.2.2" authors = [ - { name="Aibol Zhussip, Kadyr Saparbay & Arman Zhalgasbayev", email="supwithproject@gmail.com" }, + { name="Aibol Zhussip, Kadyr Saparbay, Tair Ussen & Arman Zhalgasbayev", email="supwithproject@gmail.com" }, ] description = "A package for working with Kazakh language text processing." readme = "README.md" diff --git a/src/qaznltk/qaznltk.py b/src/qaznltk/qaznltk.py index 41ce3fc..b1e5612 100644 --- a/src/qaznltk/qaznltk.py +++ b/src/qaznltk/qaznltk.py @@ -1,6 +1,7 @@ import re from typing import List from collections import Counter +from functools import lru_cache class QazNLTK: ''' @@ -62,10 +63,43 @@ class QazNLTK: positive_words = set() negative_words = set() + @lru_cache(maxsize=None) def __init__(self, stop_words_file="special_words/stop_words.txt", positive_words_file="special_words/positive_words.txt", negative_words_file="special_words/negative_words.txt") -> None: QazNLTK.stop_words = set(self.load_words(stop_words_file)) QazNLTK.positive_words = set(self.load_words(positive_words_file)) QazNLTK.negative_words = set(self.load_words(negative_words_file)) + + @staticmethod + def __preprocess_text(text: str) -> str: + return re.sub(r'\W+', '', text.lower()) + + @staticmethod + def __jaccard_similarity(str1: str, str2: str) -> float: + # Calculate Jaccard similarity: https://en.wikipedia.org/wiki/Jaccard_index + set1 = set(str1) + set2 = set(str2) + intersection = 
len(set1.intersection(set2)) + union = len(set1.union(set2)) + return intersection / union if union != 0 else 0 + + @staticmethod + @lru_cache(maxsize=None) + def __levenshtein_distance(s1: str, s2: str) -> int: + # Calculate Levenshtein distance: https://en.wikipedia.org/wiki/Levenshtein_distance + if len(s1) < len(s2): + return QazNLTK.__levenshtein_distance(s2, s1) + if len(s2) == 0: + return len(s1) + previous_row = range(len(s2) + 1) + for i, c1 in enumerate(s1): + current_row = [i + 1] + for j, c2 in enumerate(s2): + insertions = previous_row[j + 1] + 1 + deletions = current_row[j] + 1 + substitutions = previous_row[j] + (c1 != c2) + current_row.append(min(insertions, deletions, substitutions)) + previous_row = current_row + return previous_row[-1] @staticmethod def load_words(words_file: str) -> List[str]: @@ -80,7 +114,7 @@ def load_words(words_file: str) -> List[str]: return [] @classmethod - def convert2cyrillic(cls, text: str) -> str: + def convert2cyrillic_iso9(cls, text: str) -> str: # Convert kazakh latin to cyrillic cyrillic_text = '' reversed_mapping = {v: k for k, v in cls.cyrillic_to_iso_9_mapping.items()} @@ -91,7 +125,7 @@ def convert2cyrillic(cls, text: str) -> str: return cyrillic_text @classmethod - def convert2latin(cls, text: str) -> str: + def convert2latin_iso9(cls, text: str) -> str: # Convert kazakh cyrillic to latin latin_text = '' @@ -121,22 +155,70 @@ def sent_tokenize(cls, text: str) -> List[str]: sentences = [sentence.strip() for sentence in sentences] return sentences - + @staticmethod def calc_similarity(textA: str, textB: str) -> float: - # Jaccard Similarity Calculation - setA = set(textA.split()) - setB = set(textB.split()) + # Calculate similarity between two texts + str1 = QazNLTK.__preprocess_text(textA) + str2 = QazNLTK.__preprocess_text(textB) + + if not str1 or not str2: + return 0 + + jaccard_similarity = QazNLTK.__jaccard_similarity(str1, str2) + levenshtein_distance = QazNLTK.__levenshtein_distance(str1, str2) - 
intersection = len(setA.intersection(setB)) - union = len(setA) + len(setB) - intersection + similarity_score = (jaccard_similarity + 1 / (levenshtein_distance + 1)) / 2 + + return similarity_score - similarity_score = intersection / union if union != 0 else 0.0 + @staticmethod + def get_info_from_iin(iin: str) -> dict: + # Get information from IIN + if len(iin) != 12 or not iin.isdigit(): + return {"status": "error", "message": "Incorrect IIN. The length of the IIN must be 12 digits."} - return similarity_score + year = int(iin[:2]) + month = int(iin[2:4]) + day = int(iin[4:6]) + sequence_number = int(iin[7:11]) + control_discharge = int(iin[11:12]) + + gender_code = int(iin[6]) + + if gender_code in [1, 2]: + year += 1800 + elif gender_code in [3, 4]: + year += 1900 + elif gender_code in [5, 6]: + year += 2000 + else: + return {"status": "error", "message": "Incorrect IIN. The first digit of the IIN must be in the range [1, 6]."} + + if gender_code % 2 == 1: + gender = "male" + else: + gender = "female" + + try: + from datetime import datetime + birth_date = datetime(year, month, day) + except ValueError: + return {"status": "error", "message": "Incorrect IIN. 
The date of birth is incorrect."} + + info = { + "status": "success", + "date_of_birth": birth_date.strftime("%d.%m.%Y"), + "century_of_birth": f"{year // 100 + 1}", + "gender": gender, + "sequence_number": sequence_number, + "control_discharge": control_discharge + } + + return info @classmethod - def sentimize(cls, tokens) -> int: + def sentimize(cls, tokens) -> float: # Sentiment analysis by tokens if type(tokens) == str: tokens = cls.tokenize(tokens) @@ -151,11 +233,11 @@ def sentimize(cls, tokens) -> int: negative_score += freq if positive_score > negative_score: - return 1 + return 1.0 elif positive_score < negative_score: - return -1 + return -1.0 else: - return 0 + return 0.0 @staticmethod def num2word(n): @@ -181,6 +263,7 @@ def num2word(n): "90": "тоқсан" } mp = { + 0: "нөл", 1: "он", 2: "жүз", 3: "мың", @@ -264,4 +347,7 @@ def num2word(n): # print(sentimize_score) # n = int(input()) - # print(qnltk.num2word(n)) \ No newline at end of file + # print(qnltk.num2word(n)) + + iin = input("Enter IIN: ") + print(qnltk.get_info_from_iin(iin)) \ No newline at end of file