text2embed.py
""" Subword Embeddings: https://nlp.h-its.org/bpemb """

import numpy as np
from bpemb import BPEmb


class Text2Embed:
    def __init__(self):
        # Pre-trained English BPE subword embeddings: 100k vocabulary, 300-d vectors.
        self.bpemb_en = BPEmb(lang="en", vs=100000, dim=300)

    def to_tokens(self, word):
        # Split a word (or phrase) into its BPE subword tokens.
        tokens = self.bpemb_en.encode(word)
        return tokens

    def to_embed(self, word, mean=True):
        # Embed each subword; result has shape (num_subwords, 300).
        embed = self.bpemb_en.embed(word)
        if mean and len(embed) > 1:
            # Average the subword vectors into a single (1, 300) word embedding.
            embed = np.mean(embed, axis=0)
            embed = np.expand_dims(embed, axis=0)
        return embed


if __name__ == "__main__":
    # words = ["polyp", "instrument", "nuclei", "skin cancer", "neural structure"]
    words = ["small", "medium", "large"]
    embed = Text2Embed()
    embed_vec = []
    for word in words:
        tokens = embed.to_tokens(word)
        vec = embed.to_embed(word, mean=False)
        embed_vec.append(vec)
        print(f"Tokens: {tokens} - Vec: {vec.shape}")
    # Note: stacking with mean=False assumes every word yields the same
    # number of subwords; otherwise the array would be ragged.
    embed_vec = np.array(embed_vec)
    print(embed_vec.shape)
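
    # Minimal usage sketch (not in the original file): with mean=True,
    # to_embed always returns a (1, 300) vector per word, since single-token
    # words come back as (1, 300) and multi-token words are averaged then
    # expanded. This makes multi-token phrases such as "skin cancer" stack
    # into a regular array regardless of subword count.
    mean_vecs = np.concatenate(
        [embed.to_embed(w, mean=True) for w in ["polyp", "skin cancer"]],
        axis=0,
    )
    print(mean_vecs.shape)  # (2, 300)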