Commit

feat: docstrings

leehanchung committed Nov 26, 2023
1 parent 7c7e163 commit 3de806c
Showing 3 changed files with 71 additions and 54 deletions.
110 changes: 59 additions & 51 deletions app/app.py
@@ -21,40 +21,36 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores.base import VectorStore
-from prompts import EXAMPLE_PROMPT, PROMPT
+from prompts import EXAMPLE_PROMPT, PROMPT, WELCOME_MESSAGE


WELCOME_MESSAGE = """\
Welcome to Introduction to LLM App Development Sample PDF QA Application!
To get started:
1. Upload a PDF or text file
2. Ask any question about the file!
"""
def process_file(*, file: AskFileResponse) -> List[Document]:
"""Takes a Chailit AskFileResponse, get the document and process and chunk
it into a list of Langchain's Documents. Each Document has page_content and
matadata fields. Supports PDF files only.
Args:
file (AskFileResponse): User's file input
def process_file(*, file: AskFileResponse) -> List[Document]:
Raises:
TypeError: when the file type is not pdf
ValueError: when the PDF is not parseable
Returns:
List[Document]: chunked documents
"""
if file.type != "application/pdf":
raise TypeError("Only PDF files are supported")

with NamedTemporaryFile() as tempfile:
tempfile.write(file.content)

-        ######################################################################
-        #
-        # 1. Load the PDF
-        #
-        ######################################################################
        loader = PDFPlumberLoader(tempfile.name)
-        ######################################################################
        documents = loader.load()

-        ######################################################################
-        #
-        # 2. Split the text
-        #
-        ######################################################################
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
-        ######################################################################
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=2000, chunk_overlap=100
+        )
        docs = text_splitter.split_documents(documents)

for i, doc in enumerate(docs):
@@ -66,34 +62,51 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
return docs

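A minimal sketch (editor's aside, not part of this commit) of the same load-and-split step run standalone, assuming the legacy langchain package used here is installed; "sample.pdf" is a hypothetical path:

    from langchain.document_loaders import PDFPlumberLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Load each PDF page as a Document, then re-chunk into ~2000-character
    # pieces with 100 characters of overlap, mirroring process_file above.
    documents = PDFPlumberLoader("sample.pdf").load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    print(f"{len(documents)} pages -> {len(docs)} chunks")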

-def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:
+def create_search_engine(
+    *, docs: List[Document], embeddings: Embeddings
+) -> VectorStore:
+    """Takes a list of LangChain Documents and a LangChain embeddings wrapper
+    over encoder models, and indexes the data into ChromaDB as a search engine.
+
+    Args:
+        docs (List[Document]): list of documents to be ingested
+        embeddings (Embeddings): encoder model
+
+    Returns:
+        VectorStore: vector store for RAG
+    """
+    # Initialize Chromadb client to enable resetting and disable telemetry
    client = chromadb.EphemeralClient()
    client_settings = Settings(
-        chroma_db_impl="duckdb+parquet", anonymized_telemetry=False, persist_directory=".chromadb", allow_reset=True
+        chroma_db_impl="duckdb+parquet",
+        anonymized_telemetry=False,
+        persist_directory=".chromadb",
+        allow_reset=True,
    )

# Reset the search engine to ensure we don't use old copies.
# NOTE: we do not need this for production
search_engine = Chroma(client=client, client_settings=client_settings)
search_engine._client.reset()

-    ##########################################################################
-    #
-    # 4. Create the document search engine. Remember to add
-    #    client_settings using the above settings.
-    #
-    ##########################################################################
    search_engine = Chroma.from_documents(
-        client=client, documents=docs, embedding=embeddings, client_settings=client_settings
+        client=client,
+        documents=docs,
+        embedding=embeddings,
+        client_settings=client_settings,
    )
-    ##########################################################################

return search_engine

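A minimal sketch (not part of this commit) of exercising create_search_engine directly, assuming an OPENAI_API_KEY is set in the environment and `docs` comes from process_file above; the query string is made up:

    from langchain.embeddings.openai import OpenAIEmbeddings

    engine = create_search_engine(
        docs=docs, embeddings=OpenAIEmbeddings(model="text-embedding-ada-002")
    )
    # Chroma exposes plain similarity search as well as the retriever API.
    for hit in engine.similarity_search("What is this document about?", k=3):
        print(hit.metadata.get("source"), "->", hit.page_content[:80])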

@cl.on_chat_start
async def on_chat_start():
"""This function is run at every chat session starts to ask user for file,
index it, and build the RAG chain.
Raises:
SystemError: yolo
"""
+    # Ask the user to upload a PDF to chat with
    files = None
    while files is None:
@@ -112,48 +125,43 @@ async def on_chat_start():
msg.content = f"`{file.name}` processed. Loading ..."
await msg.update()

-    ##########################################################################
-    #
-    # 3. Set the Encoder model for creating embeddings
-    #
-    ##########################################################################
+    # Index documents into search engine
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
-    ##########################################################################
    try:
-        search_engine = await cl.make_async(create_search_engine)(docs=docs, embeddings=embeddings)
+        search_engine = await cl.make_async(create_search_engine)(
+            docs=docs, embeddings=embeddings
+        )
except Exception as e:
await cl.Message(content=f"Error: {e}").send()
raise SystemError
msg.content = f"`{file.name}` loaded. You can now ask questions!"
await msg.update()

-    llm = ChatOpenAI(model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True)
-
-    ##########################################################################
-    #
-    # 5. Create the chain / tool for RetrievalQAWithSourcesChain.
-    #
-    ##########################################################################
+    # RAG Chain
+    llm = ChatOpenAI(
+        model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True
+    )
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=search_engine.as_retriever(max_tokens_limit=4097),
-        ######################################################################
-        # 6. Customize prompts to improve summarization and question
-        #    answering performance. Perhaps create your own prompt in prompts.py?
-        ######################################################################
        chain_type_kwargs={"prompt": PROMPT, "document_prompt": EXAMPLE_PROMPT},
    )
-    ##########################################################################

cl.user_session.set("chain", chain)


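A minimal sketch (not part of this commit) of calling the chain outside Chainlit, assuming the `chain` built above is in scope; the question is made up. RetrievalQAWithSourcesChain returns a dict whose "answer" and "sources" keys the handler below relies on:

    # Synchronous equivalent of the chain.acall(...) used in main() below.
    result = chain({"question": "What are the key findings?"})
    print(result["answer"])
    print(result["sources"])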
@cl.on_message
async def main(message: cl.Message):
"""Invoked whenever we receive a Chainlit message.
Args:
message (cl.Message): user input
"""
    chain = cl.user_session.get("chain")  # type: RetrievalQAWithSourcesChain
    response = await chain.acall(
-        message.content, callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)]
+        message.content,
+        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)],
    )

answer = response["answer"]
13 changes: 11 additions & 2 deletions app/prompts.py
@@ -1,7 +1,7 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
@@ -11,9 +11,18 @@
=========
FINAL ANSWER:"""

-PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
+PROMPT = PromptTemplate(
+    template=template, input_variables=["summaries", "question"]
+)

EXAMPLE_PROMPT = PromptTemplate(
template="Content: {page_content}\nSource: {source}",
input_variables=["page_content", "source"],
)

WELCOME_MESSAGE = """\
Welcome to Introduction to LLM App Development Sample PDF QA Application!
To get started:
1. Upload a PDF or text file
2. Ask any question about the file!
"""
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -32,7 +32,7 @@ isort = "^5.12.0"

[tool.ruff]
target-version = "py310"
-line-length = 120
+line-length = 80
select = [
"E", "W", # see: https://pypi.org/project/pycodestyle
"F", # see: https://pypi.org/project/pyflakes
