Skip to content

Commit

Permalink
feat: updates
Browse files Browse the repository at this point in the history
  • Loading branch information
leehanchung committed Nov 26, 2023
1 parent c424c21 commit 435b9af
Show file tree
Hide file tree
Showing 3 changed files with 1,834 additions and 1,400 deletions.
7 changes: 6 additions & 1 deletion .env.sample
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
OPENAI_ORG_ID="org-your-org-id"
ALLOW_RESET=TRUE
OPENAI_API_KEY="sk-your-openai-api-key"

LANGCHAIN_TRACING_V2="true"
LANGCHAIN_ENDPOINT="https://api.langchain.plus"
LANGCHAIN_API_KEY="ls_your-langsmith-key-here"
LANGCHAIN_PROJECT=chat-with-pdf
93 changes: 58 additions & 35 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,21 @@
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
from typing import List

from tempfile import NamedTemporaryFile

import chainlit as cl
from chainlit.types import AskFileResponse
import chromadb
from chromadb.config import Settings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.base import Chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PDFPlumberLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.schema.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores.base import VectorStore
Expand All @@ -28,10 +32,9 @@
"""


def process_file(*, file: AskFileResponse) -> list:
def process_file(*, file: AskFileResponse) -> List[Document]:
if file.type != "application/pdf":
raise TypeError("Only PDF files are supported")


with NamedTemporaryFile() as tempfile:
tempfile.write(file.content)
Expand All @@ -55,40 +58,34 @@ def process_file(*, file: AskFileResponse) -> list:
chunk_overlap=100
)
######################################################################

docs = text_splitter.split_documents(documents)

for i, doc in enumerate(docs):
doc.metadata["source"] = f"source_{i}"
return docs

if not docs:
raise ValueError("PDF file parsing failed.")

def create_search_engine(*, file: AskFileResponse) -> VectorStore:
docs = process_file(file=file)


##########################################################################
#
# 3. Set the Encoder model for creating embeddings
#
##########################################################################
encoder = OpenAIEmbeddings(
model="text-embedding-ada-002"
)
##########################################################################
return docs

# Save data in the user session
cl.user_session.set("docs", docs)

search_engine = Chroma(persist_directory=".chromadb")
search_engine._client.reset()
def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:

# Initialize Chromadb client to enable resetting and disable telemtry
client = chromadb.EphemeralClient()
client_settings = Settings(
chroma_db_impl="duckdb+parquet",
anonymized_telemetry=False,
persist_directory=".chromadb",
allow_reset=True
allow_reset=True )

# Reset the search engine to ensure we don't use old copies.
# NOTE: we do not need this for production
search_engine = Chroma(
client=client,
client_settings=client_settings
)
search_engine._client.reset()

##########################################################################
#
Expand All @@ -97,35 +94,56 @@ def create_search_engine(*, file: AskFileResponse) -> VectorStore:
#
##########################################################################
search_engine = Chroma.from_documents(
client=client,
documents=docs,
embedding=encoder,
metadatas=[doc.metadata for doc in docs],
embedding=embeddings,
client_settings=client_settings
)
##########################################################################

return search_engine


@cl.langchain_factory(use_async=True)
async def chat() -> Chain:
@cl.on_chat_start
async def on_chat_start():

# Asking user to to upload a PDF to chat with
files = None
while files is None:
files = await cl.AskFileMessage(
content=WELCOME_MESSAGE,
accept=["application/pdf"],
max_size_mb=20,
).send()

file = files[0]

# Process and save data in the user session
msg = cl.Message(content=f"Processing `{file.name}`...")
await msg.send()

docs = process_file(file=file)
cl.user_session.set("docs", docs)
msg.content = f"`{file.name}` processed. Loading ..."
await msg.update()

##########################################################################
#
# 3. Set the Encoder model for creating embeddings
#
##########################################################################
embeddings = OpenAIEmbeddings(
model="text-embedding-ada-002"
)
##########################################################################
try:
search_engine = await cl.make_async(create_search_engine)(file=file)
search_engine = await cl.make_async(create_search_engine)(
docs=docs,
embeddings=embeddings
)
except Exception as e:
await cl.Message(content=f"Error: {e}").send()
raise SystemError
msg.content = f"`{file.name}` loaded. You can now ask questions!"
await msg.update()

llm = ChatOpenAI(
model='gpt-3.5-turbo-16k-0613',
Expand Down Expand Up @@ -153,15 +171,20 @@ async def chat() -> Chain:
)
##########################################################################

await msg.update(content=f"`{file.name}` processed. You can now ask questions!")
cl.user_session.set("chain", chain)

return chain

@cl.on_message
async def main(message: cl.Message):

chain = cl.user_session.get("chain") # type: RetrievalQAWithSourcesChain
response = await chain.acall(
message.content,
callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)]
)

@cl.langchain_postprocess
async def process_response(res):
answer = res["answer"]
sources = res["sources"].strip()
answer = response["answer"]
sources = response["sources"].strip()
source_elements = []

# Get the documents from the user session
Expand Down
Loading

0 comments on commit 435b9af

Please sign in to comment.