-
Notifications
You must be signed in to change notification settings - Fork 0
/
suxotin.py
192 lines (159 loc) · 9.26 KB
/
suxotin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import time
import numpy as np
from pathlib import Path
import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg', quiet=True) # Download the Gutenberg Corpus if not already downloaded
def preprocess_text(text) -> str:
"""
Convert the input text to lowercase and remove all characters except alphabets and spaces.
Args:
text (str): The text to preprocess.
Returns:
str: The preprocessed text with only lowercase letters and spaces.
"""
# Iterate through text, convert to lower case if alphabetic or space, otherwise remove
return ''.join(char.lower() if char.isalpha() or char.isspace() else '' for char in text)
def create_frequency_matrix(text) -> tuple[np.ndarray, dict]:
"""
Create a frequency matrix of adjacent characters in the text based on unique characters.
This matrix is symmetric and counts the adjacencies for each pair of characters in the text.
The matrix and character index mapping are essential for analyzing character adjacency relationships.
Args:
text (str): The text from which to create the frequency matrix.
Returns:
tuple: A tuple containing the matrix as a NumPy array and a dictionary mapping characters to indices,
which serves as the legend for understanding the matrix indices.
"""
# Extract all unique characters from the text to define the matrix dimension
unique_chars = set(text)
# Map each unique character to a unique index using dictionary comprehension
char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
# Determine the size of the matrix based on the number of unique characters
size = len(unique_chars)
# Initialize a square matrix of zeros with dimensions [size x size]
matrix = np.zeros((size, size), dtype=int)
# Loop through the text, character by character, excluding the last character to prevent index error
for i in range(len(text) - 1):
# Retrieve indices from the mapping for current and next character
l_idx, r_idx = char_to_index[text[i]], char_to_index[text[i+1]]
# Increment the position in the matrix for both (l_idx, r_idx) and (r_idx, l_idx) to maintain symmetry
matrix[l_idx][r_idx] += 1
matrix[r_idx][l_idx] += 1 # Ensures the matrix is symmetric
# Ensure diagonal entries are zero to ignore self-pairings, as a character adjacent to itself is not considered
np.fill_diagonal(matrix, 0)
return matrix, char_to_index
def classify_vowels(sums, matrix, char_to_index) -> tuple[set, set]:
"""
Classify characters as vowels or consonants based on their adjacency counts within a given text.
Characters with higher adjacency counts are initially considered for classification as vowels,
as they typically appear more frequently next to a variety of other characters. The classification
process iteratively selects the character with the highest adjacency count, adjusts the counts,
and then classifies the remaining characters as consonants.
Args:
sums (np.ndarray): Array of sums for each character's adjacencies, where each sum is the total count
of adjacencies involving the corresponding character.
matrix (np.ndarray): The frequency matrix of character adjacencies, where the element at [i, j]
indicates the adjacency count between characters i and j.
char_to_index (dict): Mapping of characters to their respective indices in the matrix. This mapping
helps identify characters from indices in the matrix.
Returns:
tuple: A tuple containing two sets, one of characters classified as vowels and the other as consonants.
"""
classified_vowels = set() # Set to store characters classified as vowels
classified_consonants = set() # Initially empty set for consonants
index_to_char = {idx: char for char, idx in char_to_index.items()} # Reverse mapping from indices to characters
remaining_sums = sums.copy() # Copy of the sums array to avoid altering the original during classification
# Iterate over sums to classify characters with the highest adjacency counts as vowels
while np.any(remaining_sums > 0):
vowel_idx = np.argmax(remaining_sums) # Find index with the highest adjacency count
if remaining_sums[vowel_idx] > 0: # Check if the highest count is positive
char = index_to_char[vowel_idx]
classified_vowels.add(char) # Classify character as a vowel
remaining_sums -= 2 * matrix[:, vowel_idx] # Reduce sums by counts associated with the newly classified vowel
remaining_sums[vowel_idx] = 0 # Zero out the sum for the classified vowel to prevent reclassification
# Use set comprehension to classify the remaining characters as consonants
classified_consonants = {
index_to_char[idx] for idx in range(len(sums))
if index_to_char[idx].isalpha() and index_to_char[idx] not in classified_vowels # Ensure character is alphabetic and not already classified as a vowel
}
return classified_vowels, classified_consonants
def suxotins_algorithm(text: str, preprocess: bool = True) -> tuple[set, set]:
"""
Apply Suxotin's algorithm to classify characters in a given text into vowels and consonants based on their adjacency frequencies.
This method involves processing the text through a frequency matrix that counts adjacencies between characters.
Characters with higher adjacency counts are more likely to be classified as vowels, a decision based on the assumption
that vowels generally appear more frequently and adjacent to a variety of other characters.
Args:
text (str): The text to be analyzed and processed.
preprocess (bool): Indicates whether the text should be preprocessed to normalize it (e.g., converting to lowercase and removing non-alphabetic characters).
The default value is True, which applies preprocessing.
Returns:
tuple: A tuple containing two sets:
- The first set includes characters classified as vowels.
- The second set includes characters classified as consonants.
"""
# Conditionally preprocess the text to remove non-alphabetic characters and convert to lowercase for uniformity
if preprocess:
text = preprocess_text(text)
# Generate a frequency matrix and a character-to-index mapping from the processed text
matrix, char_to_index = create_frequency_matrix(text)
# Calculate the adjacency sums which will be used to classify characters
sums = matrix.sum(axis=1)
# Classify characters into vowels and consonants based on the calculated sums and the frequency matrix
vowels, consonants = classify_vowels(sums, matrix, char_to_index)
# Return the classified sets of vowels and consonants
return vowels, consonants
def get_preprocess_confirmation() -> bool:
"""
Prompt the user to confirm if they want to preprocess the text.
Accepts any input starting with 'y' or 'n' as a valid response and is case insensitive.
Returns:
bool: True if the user confirms preprocessing, False otherwise.
"""
while True:
preprocess_input = input("Do you want to preprocess the text? (yes/no): ").strip().lower()
if preprocess_input.startswith('y'):
return True
elif preprocess_input.startswith('n'):
return False
else:
print("Invalid input. Please answer with 'yes' or 'no'.")
def main():
"""
Main execution function to process a text file or Gutenberg Corpus and classify characters using Suxotin's algorithm.
Filters and prints sorted, printable vowels and consonants, excluding spaces and newlines.
Enhances user input flexibility and organizes output for better readability.
"""
# Gather user inputs first
data_source = input("Choose the data source - 'local' for local file, 'nltk' for Gutenberg Corpus: ").lower()
preprocess = get_preprocess_confirmation()
# Start timing after user input
start_time = time.perf_counter()
try:
if data_source == 'local':
file_path = Path('sherlock_holmes.txt')
with file_path.open('r', encoding='utf-8') as file:
text = file.read()
elif data_source == 'nltk':
text = ' '.join(gutenberg.words())
else:
raise ValueError("Invalid data source selected.")
vowels, consonants = suxotins_algorithm(text, preprocess)
# Using list comprehension to filter out non-printable characters
printable_vowels = sorted([v for v in vowels if v not in ' \n\t'])
printable_consonants = sorted([c for c in consonants if c not in ' \n\t'])
# Improved output formatting for clarity and aesthetics
print("\nClassified Characters:")
print("Vowels: ", ', '.join(printable_vowels))
print("Consonants: ", ', '.join(printable_consonants))
except FileNotFoundError:
print("File not found. Ensure the file is in the correct directory.")
except ValueError as e:
print(str(e))
except Exception as e:
print(f"An unexpected error occurred: {str(e)}")
end_time = time.perf_counter() # End timing
print(f"Execution time: {end_time - start_time:.4f} seconds") # Print execution time
if __name__ == '__main__':
main()