-
Notifications
You must be signed in to change notification settings - Fork 0
/
Analogies with Nepali Language.py
82 lines (68 loc) · 2.29 KB
/
Analogies with Nepali Language.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
"""
Date : 04/30/2020
Author : Suman Sigdel
File : Analogies with Nepali Language - functions that find word analogies in the Nepali language using a pretrained word2vec model.
"""
# Imports
import numpy as np
## File Paths
# Text file of pretrained embeddings; it is handed to read_word_to_vecs,
# which parses every line as a word followed by its vector components.
word_2_vec = "nepali_embeddings_word2vec.txt"
# Function that reads the pretrained file
def read_word_to_vecs(pretrained_file):
    """
    Load pretrained word embeddings from a whitespace-separated text file.

    Every data line is expected to hold a word followed by the components of
    its vector. Blank lines are skipped, and so is a leading word2vec-style
    header line made of exactly two integers (vocabulary size and dimension).

    pretrained_file : Path to the embeddings text file (decoded as UTF-8)
    Returns :
        words           : Set of all words found in the file
        word_to_vec_map : Dictionary mapping each word to a float64 numpy array
    Raises :
        ValueError if a vector component cannot be parsed as a float
    """
    words = set()
    word_to_vec_map = {}
    # The vocabulary is Devanagari text, so decode explicitly as UTF-8 instead
    # of relying on the platform's default encoding.
    with open(pretrained_file, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            tokens = line.split()
            # Blank (or whitespace-only) lines carry no embedding.
            if not tokens:
                continue
            # word2vec text files start with "<vocab_size> <dimension>";
            # that header is metadata, not a word with a 1-d vector.
            if (line_number == 0 and len(tokens) == 2
                    and all(token.isdigit() for token in tokens)):
                continue
            curr_word = tokens[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(tokens[1:], dtype=np.float64)
    return words, word_to_vec_map
# Load the embeddings once at module level; the statements further down read
# the resulting `words` set and `word_to_vec_map` dictionary as globals.
words, word_to_vec_map = read_word_to_vecs(word_2_vec)
def cosine_similarity(u, v):
    """
    Compute the cosine similarity between two word vectors.

    u : Word vector
    v : Word vector
    Returns :
        Dot product of u and v divided by the product of their L2 norms
    """
    # Inner product of the two vectors
    numerator = np.dot(u, v)
    # Product of the L2 (Euclidean) norms, i.e. the vector lengths
    denominator = np.sqrt(np.sum(u * u)) * np.sqrt(np.sum(v * v))
    return numerator / denominator
# Demo: look up the vectors of two vocabulary words and print their cosine similarity
word_u, word_v = word_to_vec_map["ठमेल"], word_to_vec_map["न्यूरोड"]
print(cosine_similarity(word_u, word_v))
def analogy_finder(word_a, word_b, word_c, word_2_vec_map, word_list):
    """
    Complete the analogy "word_a is to word_b as word_c is to ____".

    The answer is the candidate whose offset from word_c (candidate - word_c)
    has the highest cosine similarity with the offset word_b - word_a.

    word_a : First word of the reference pair
    word_b : Second word of the reference pair
    word_c : Word whose counterpart is searched for
    word_2_vec_map : Dictionary mapping words to their vectors; it must hold
                     word_a, word_b, word_c and every word of word_list
    word_list : Iterable of candidate words to search through
    Returns :
        The best matching candidate, or None when word_list offers no
        candidate other than the three input words
    Raises :
        KeyError if a required word is missing from word_2_vec_map
    """
    # All three vectors come from the mapping supplied by the caller, so the
    # function works with any embedding table and not only a module global.
    e_a = word_2_vec_map[word_a]
    e_b = word_2_vec_map[word_b]
    e_c = word_2_vec_map[word_c]
    # The reference offset does not depend on the candidate: compute it once.
    target_offset = e_b - e_a
    input_words_set = {word_a, word_b, word_c}
    # Cosine similarity is never below -1, so any real score beats this start.
    max_cosine_similarity = float("-inf")
    final_word = None
    for word in word_list:
        # The input words themselves are not valid answers
        if word in input_words_set:
            continue
        sim = cosine_similarity(target_offset, word_2_vec_map[word] - e_c)
        # A NaN score (zero-length offset) compares False and is never chosen
        if sim > max_cosine_similarity:
            max_cosine_similarity = sim
            final_word = word
    return final_word
# Example analogy query: word_a is to word_b as word_c is to the printed word
word_a, word_b, word_c = "केटा", "केटी", "राजा"
print(analogy_finder(word_a, word_b, word_c, word_to_vec_map, words))