Skip to content

Commit

Permalink
feat: add file grouping feature (#416) bump:patch
Browse files Browse the repository at this point in the history
  • Loading branch information
taprosoft authored Oct 21, 2024
1 parent 2bc1b01 commit 764fe59
Show file tree
Hide file tree
Showing 3 changed files with 413 additions and 81 deletions.
20 changes: 20 additions & 0 deletions libs/ktem/ktem/index/file/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,31 @@ def _setup_resources(self):
"user": Column(Integer, default=1),
},
)
FileGroup = type(
"FileGroupTable",
(Base,),
{
"__tablename__": f"index__{self.id}__group",
"id": Column(Integer, primary_key=True, autoincrement=True),
"date_created": Column(
DateTime(timezone=True), server_default=func.now()
),
"name": Column(String, unique=True),
"user": Column(Integer, default=1),
"data": Column(
MutableDict.as_mutable(JSON), # type: ignore
default={"files": []},
),
},
)

self._vs: BaseVectorStore = get_vectorstore(f"index_{self.id}")
self._docstore: BaseDocumentStore = get_docstore(f"index_{self.id}")
self._fs_path = filestorage_path / f"index_{self.id}"
self._resources = {
"Source": Source,
"Index": Index,
"FileGroup": FileGroup,
"VectorStore": self._vs,
"DocStore": self._docstore,
"FileStoragePath": self._fs_path,
Expand Down Expand Up @@ -297,6 +315,7 @@ def on_create(self):
self._setup_resources()
self._resources["Source"].metadata.create_all(engine) # type: ignore
self._resources["Index"].metadata.create_all(engine) # type: ignore
self._resources["FileGroup"].metadata.create_all(engine) # type: ignore
self._fs_path.mkdir(parents=True, exist_ok=True)

def on_delete(self):
Expand All @@ -306,6 +325,7 @@ def on_delete(self):
self._setup_resources()
self._resources["Source"].__table__.drop(engine) # type: ignore
self._resources["Index"].__table__.drop(engine) # type: ignore
self._resources["FileGroup"].__table__.drop(engine) # type: ignore
self._vs.drop()
self._docstore.drop()
shutil.rmtree(self._fs_path)
Expand Down
11 changes: 11 additions & 0 deletions libs/ktem/ktem/index/file/pipelines.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import json
import logging
import shutil
import threading
Expand Down Expand Up @@ -120,6 +121,16 @@ def run(
text: the text to retrieve similar documents
doc_ids: list of document ids to constrain the retrieval
"""
# flatten doc_ids in case groups of doc_ids are passed as JSON-encoded lists
if doc_ids:
flatten_doc_ids = []
for doc_id in doc_ids:
if doc_id.startswith("["):
flatten_doc_ids.extend(json.loads(doc_id))
else:
flatten_doc_ids.append(doc_id)
doc_ids = flatten_doc_ids

print("searching in doc_ids", doc_ids)
if not doc_ids:
logger.info(f"Skip retrieval because of no selected files: {self}")
Expand Down
Loading

0 comments on commit 764fe59

Please sign in to comment.