diff --git a/app/app.py b/app/app.py
index d958686..34e2516 100644
--- a/app/app.py
+++ b/app/app.py
@@ -21,40 +21,36 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.vectorstores.base import VectorStore
 
-from prompts import EXAMPLE_PROMPT, PROMPT
+from prompts import EXAMPLE_PROMPT, PROMPT, WELCOME_MESSAGE
 
-WELCOME_MESSAGE = """\
-Welcome to Introduction to LLM App Development Sample PDF QA Application!
-To get started:
-1. Upload a PDF or text file
-2. Ask any question about the file!
-"""
+def process_file(*, file: AskFileResponse) -> List[Document]:
+    """Takes a Chainlit AskFileResponse, gets the document, then processes and
+    chunks it into a list of Langchain Documents. Each Document has
+    page_content and metadata fields. Supports PDF files only.
 
+    Args:
+        file (AskFileResponse): User's file input
 
-def process_file(*, file: AskFileResponse) -> List[Document]:
+    Raises:
+        TypeError: when the file type is not PDF
+        ValueError: when the PDF is not parseable
+
+    Returns:
+        List[Document]: chunked documents
+    """
     if file.type != "application/pdf":
         raise TypeError("Only PDF files are supported")
 
     with NamedTemporaryFile() as tempfile:
         tempfile.write(file.content)
 
-        ######################################################################
-        #
-        # 1. Load the PDF
-        #
-        ######################################################################
         loader = PDFPlumberLoader(tempfile.name)
-        ######################################################################
         documents = loader.load()
 
-        ######################################################################
-        #
-        # 2. Split the text
-        #
-        ######################################################################
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
-        ######################################################################
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=2000, chunk_overlap=100
+        )
         docs = text_splitter.split_documents(documents)
 
         for i, doc in enumerate(docs):
@@ -66,11 +62,26 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
     return docs
 
 
-def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:
+def create_search_engine(
+    *, docs: List[Document], embeddings: Embeddings
+) -> VectorStore:
+    """Takes a list of Langchain Documents and a Langchain Embeddings wrapper
+    over an encoder model, and indexes them into ChromaDB as a search engine.
+
+    Args:
+        docs (List[Document]): list of documents to be ingested
+        embeddings (Embeddings): encoder model
+
+    Returns:
+        VectorStore: vector store for RAG
+    """
     # Initialize Chromadb client to enable resetting and disable telemtry
     client = chromadb.EphemeralClient()
     client_settings = Settings(
-        chroma_db_impl="duckdb+parquet", anonymized_telemetry=False, persist_directory=".chromadb", allow_reset=True
+        chroma_db_impl="duckdb+parquet",
+        anonymized_telemetry=False,
+        persist_directory=".chromadb",
+        allow_reset=True,
     )
 
     # Reset the search engine to ensure we don't use old copies.
@@ -78,22 +89,24 @@ def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> Vec
     search_engine = Chroma(client=client, client_settings=client_settings)
     search_engine._client.reset()
 
-    ##########################################################################
-    #
-    # 4. Create the document search engine. Remember to add
-    #    client_settings using the above settings.
-    #
-    ##########################################################################
     search_engine = Chroma.from_documents(
-        client=client, documents=docs, embedding=embeddings, client_settings=client_settings
+        client=client,
+        documents=docs,
+        embedding=embeddings,
+        client_settings=client_settings,
    )
-    ##########################################################################
 
     return search_engine
 
 
 @cl.on_chat_start
 async def on_chat_start():
+    """Runs at the start of every chat session to ask the user for a file,
+    index it, and build the RAG chain.
+
+    Raises:
+        SystemError: when indexing the uploaded file fails
+    """
     # Asking user to to upload a PDF to chat with
     files = None
     while files is None:
@@ -112,48 +125,43 @@ async def on_chat_start():
     msg.content = f"`{file.name}` processed. Loading ..."
     await msg.update()
 
-    ##########################################################################
-    #
-    # 3. Set the Encoder model for creating embeddings
-    #
-    ##########################################################################
+    # Index documents into search engine
     embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
-    ##########################################################################
     try:
-        search_engine = await cl.make_async(create_search_engine)(docs=docs, embeddings=embeddings)
+        search_engine = await cl.make_async(create_search_engine)(
+            docs=docs, embeddings=embeddings
+        )
     except Exception as e:
         await cl.Message(content=f"Error: {e}").send()
         raise SystemError
 
     msg.content = f"`{file.name}` loaded. You can now ask questions!"
     await msg.update()
 
-    llm = ChatOpenAI(model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True)
-
-    ##########################################################################
-    #
-    # 5. Create the chain / tool for RetrievalQAWithSourcesChain.
-    #
-    ##########################################################################
+    # RAG Chain
+    llm = ChatOpenAI(
+        model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True
+    )
     chain = RetrievalQAWithSourcesChain.from_chain_type(
         llm=llm,
         chain_type="stuff",
         retriever=search_engine.as_retriever(max_tokens_limit=4097),
-        ######################################################################
-        # 6. Customize prompts to improve summarization and question
-        #    answering performance. Perhaps create your own prompt in prompts.py?
-        ######################################################################
         chain_type_kwargs={"prompt": PROMPT, "document_prompt": EXAMPLE_PROMPT},
     )
-    ##########################################################################
 
     cl.user_session.set("chain", chain)
 
 
 @cl.on_message
 async def main(message: cl.Message):
+    """Invoked whenever we receive a Chainlit message.
+
+    Args:
+        message (cl.Message): user input
+    """
     chain = cl.user_session.get("chain")  # type: RetrievalQAWithSourcesChain
     response = await chain.acall(
-        message.content, callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)]
+        message.content,
+        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)],
     )
 
     answer = response["answer"]
diff --git a/app/prompts.py b/app/prompts.py
index 0418526..a95d67c 100644
--- a/app/prompts.py
+++ b/app/prompts.py
@@ -1,7 +1,7 @@
 # flake8: noqa
 from langchain.prompts import PromptTemplate
 
-template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
+template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
 If you don't know the answer, just say that you don't know. Don't try to make up an answer.
 ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: , , , ...".
 
@@ -11,9 +11,18 @@
 =========
 FINAL ANSWER:"""
 
-PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
+PROMPT = PromptTemplate(
+    template=template, input_variables=["summaries", "question"]
+)
 
 EXAMPLE_PROMPT = PromptTemplate(
     template="Content: {page_content}\nSource: {source}",
     input_variables=["page_content", "source"],
 )
+
+WELCOME_MESSAGE = """\
+Welcome to Introduction to LLM App Development Sample PDF QA Application!
+To get started:
+1. Upload a PDF or text file
+2. Ask any question about the file!
+"""
diff --git a/pyproject.toml b/pyproject.toml
index 4cfebec..ceb0d7c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ isort = "^5.12.0"
 
 [tool.ruff]
 target-version = "py310"
-line-length = 120
+line-length = 80
 select = [
     "E", "W",  # see: https://pypi.org/project/pycodestyle
     "F",  # see: https://pypi.org/project/pyflakes