-
Notifications
You must be signed in to change notification settings - Fork 0
/
helper_utils.py
61 lines (42 loc) · 1.86 KB
/
helper_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import numpy as np
from pypdf import PdfReader
from tqdm import tqdm
def _read_pdf(filename):
    """Return the stripped text of every non-empty page in a PDF.

    Args:
        filename: Passed straight to ``PdfReader``.

    Returns:
        A list with one string per page, in page order. Each string is the
        page's ``extract_text()`` result with surrounding whitespace
        stripped; pages whose stripped text is empty are left out.
    """
    page_texts = []
    for page in PdfReader(filename).pages:
        stripped = page.extract_text().strip()
        # Pages with no extractable text would only add empty entries.
        if stripped:
            page_texts.append(stripped)
    return page_texts
def _chunk_texts(texts):
    """Split a list of texts into chunks in two passes.

    The texts are joined with blank lines and split by
    ``RecursiveCharacterTextSplitter`` (chunk_size=1000, no overlap). Each
    resulting piece is then split again by
    ``SentenceTransformersTokenTextSplitter`` (256 tokens per chunk, no
    overlap).

    Args:
        texts: Iterable of strings to be joined and split.

    Returns:
        A flat list of the token-level chunks, in document order.
    """
    joined = '\n\n'.join(texts)

    # First pass: character-based split of the joined text.
    by_characters = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=0,
    ).split_text(joined)

    # Second pass: split every character-level piece by token count.
    by_tokens = SentenceTransformersTokenTextSplitter(
        chunk_overlap=0,
        tokens_per_chunk=256,
    )
    chunks = []
    for piece in by_characters:
        chunks.extend(by_tokens.split_text(piece))
    return chunks
def load_chroma(filename, collection_name, embedding_function):
    """Read a PDF, chunk it, and load the chunks into a new Chroma collection.

    Args:
        filename: PDF to read; handed to ``_read_pdf``.
        collection_name: Name given to the collection that is created.
        embedding_function: Passed to ``create_collection`` as the
            collection's embedding function.

    Returns:
        The created collection, after the chunks have been added to it.
    """
    chunks = _chunk_texts(_read_pdf(filename))

    client = chromadb.Client()
    collection = client.create_collection(
        name=collection_name,
        embedding_function=embedding_function,
    )

    # Each chunk's id is its position in the chunk list, as a string.
    chunk_ids = list(map(str, range(len(chunks))))
    collection.add(ids=chunk_ids, documents=chunks)
    return collection
def word_wrap(string: str, n_chars: int = 72) -> str:
    """Wrap *string* by replacing spaces with newlines.

    The text is consumed left to right. While at least ``n_chars``
    characters remain, the next ``n_chars``-character window is examined
    and the line is broken at the last space inside it; that space is
    replaced by the newline. If the window contains no space at all, the
    line is broken after exactly ``n_chars`` characters and no character is
    discarded (a space immediately following the window is still consumed
    as the break point).

    Only the space character is treated as a break point; newlines already
    present in *string* are not given special treatment.

    Args:
        string: The text to wrap.
        n_chars: Window width. Lines broken at a space are shorter than
            this; lines broken inside an unbroken run are exactly this long.

    Returns:
        The wrapped text, lines joined with ``"\\n"``.

    Raises:
        ValueError: If ``n_chars`` is smaller than 1.
    """
    if n_chars < 1:
        # A non-positive width can never make progress.
        raise ValueError("n_chars must be a positive integer")

    lines = []
    start = 0
    total = len(string)

    # Iterative rather than recursive so very long inputs cannot exhaust
    # the interpreter's recursion limit.
    while total - start >= n_chars:
        window_end = start + n_chars
        break_at = string.rfind(' ', start, window_end)
        if break_at == -1:
            # No space in the window: hard-break without dropping anything.
            lines.append(string[start:window_end])
            start = window_end
            # A space right after the window is the natural break point,
            # so swallow it instead of starting the next line with it.
            if string[start:start + 1] == ' ':
                start += 1
        else:
            lines.append(string[start:break_at])
            start = break_at + 1  # skip the space the newline replaces

    # The remainder is appended even when empty, so a break that lands
    # exactly at the end of the text yields a trailing newline.
    lines.append(string[start:])
    return '\n'.join(lines)
def project_embeddings(embeddings, umap_transform):
    """Project embeddings to 2-D, one at a time, with a progress bar.

    Args:
        embeddings: Sized iterable of embedding vectors.
        umap_transform: Object whose ``transform`` method is called with a
            one-element list holding a single embedding. Presumably a
            fitted UMAP model configured for two output components --
            confirm against the caller.

    Returns:
        A NumPy array of shape ``(len(embeddings), 2)`` whose row ``i`` is
        the transform result for ``embeddings[i]``.
    """
    n_points = len(embeddings)
    projected = np.empty((n_points, 2))

    # Transforming one vector per call lets tqdm report per-item progress.
    progress = tqdm(embeddings)
    for row, vector in enumerate(progress):
        single_result = umap_transform.transform([vector])
        projected[row] = single_result
    return projected