Merge pull request #2 from leehanchung/lab2/end
feat: update packages to current versions
leehanchung committed Nov 26, 2023
2 parents c9e11b4 + 8613716 commit 7c7e163
Showing 13 changed files with 2,569 additions and 2,226 deletions.
26 changes: 26 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,26 @@
{
"name": "llm-pdf-qa-workshop",
"image": "mcr.microsoft.com/vscode/devcontainers/python:3.10-bullseye",
"features": {
"ghcr.io/devcontainers-contrib/features/pre-commit:2": {
"version": "latest"
}
},
"mounts": [
// Re-use local git and ssh configurations. On the container it's at /workspaces/vscode
"source=${env:HOME}/.gitconfig,target=/home/vscode/.gitconfig,type=bind",
"source=${env:HOME}/.ssh,target=/home/vscode/.ssh,type=bind"
],
"postCreateCommand": ".devcontainer/post_create_command.sh",
"customizations": {
"vscode": {
"extensions": [
"GitHub.copilot",
"esbenp.prettier-vscode",
"njpwerner.autodocstring",
"ms-python.python",
"ms-azuretools.vscode-docker"
]
}
}
}
10 changes: 10 additions & 0 deletions .devcontainer/post_create_command.sh
@@ -0,0 +1,10 @@
#!/bin/bash
set -euf -o pipefail
IFS=$'\n\t'

# Install poetry
curl -sSL https://install.python-poetry.org | python3 -

# Create python virtual environment
poetry install --no-root
poetry shell
9 changes: 7 additions & 2 deletions .env.sample
@@ -1,2 +1,7 @@
OPENAI_ORG_ID="org-your-org-id"
OPENAI_API_KEY="sk-your-openai-api-key"
ALLOW_RESET=TRUE
OPENAI_API_KEY="sk-your-openai-api-key"

LANGCHAIN_TRACING_V2="true"
LANGCHAIN_ENDPOINT="https://api.langchain.plus"
LANGCHAIN_API_KEY="ls_your-langsmith-key-here"
LANGCHAIN_PROJECT=chat-with-pdf
14 changes: 14 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,14 @@
name: continuous_integration

on: [push, pull_request]

jobs:
build:
runs-on: ubuntu-latest
steps:

- name: checkout repository
uses: actions/checkout@v3

- name: ruff
uses: chartboost/ruff-action@v1
30 changes: 30 additions & 0 deletions Dockerfile
@@ -0,0 +1,30 @@
# See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.231.6/containers/python-3/.devcontainer/base.Dockerfile

# [Choice] Python version (use -bullseye variants on local arm64/Apple Silicon): 3, 3.10, 3.9, 3.8, 3.7, 3.6, 3-bullseye, 3.10-bullseye, 3.9-bullseye, 3.8-bullseye, 3.7-bullseye, 3.6-bullseye, 3-buster, 3.10-buster, 3.9-buster, 3.8-buster, 3.7-buster, 3.6-buster
ARG VARIANT

FROM mcr.microsoft.com/vscode/devcontainers/python:0-${VARIANT}

# [Choice] Node.js version: none, lts/*, 16, 14, 12, 10
ARG NODE_VERSION="none"
RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi

# Poetry
ARG POETRY_VERSION="none"
RUN if [ "${POETRY_VERSION}" != "none" ]; then su vscode -c "umask 0002 && pip3 install poetry==${POETRY_VERSION}"; fi

# Nox
ARG NOX_VERSION="none"
RUN if [ "${NOX_VERSION}" != "none" ]; then su vscode -c "umask 0002 && pip3 install nox-poetry nox==${NOX_VERSION}"; fi

# [Optional] If your pip requirements rarely change, uncomment this section to add them to the image.
# COPY requirements.txt /tmp/pip-tmp/
# RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
# && rm -rf /tmp/pip-tmp

# [Optional] Uncomment this section to install additional OS packages.
# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
# && apt-get -y install --no-install-recommends <your-package-list-here>

# [Optional] Uncomment this line to install global node packages.
# RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g <your-package-here>" 2>&1
21 changes: 19 additions & 2 deletions README.md
@@ -66,7 +66,15 @@ Please implement the missing pieces in the [application](app/app.py)

### Lab 1: Solution

[Solution](https://github.com/leehanchung/llm-pdf-qa-workshop/tree/lab1/pdf-qa-app)
2. We chose [langchain.document_loaders.PDFPlumberLoader](https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf#using-pdfplumber) to load PDF files; it also preserves PDF metadata, which will help later. (And we like the Super Mario Brothers, who are plumbers.)
3. We chose [langchain.text_splitter.RecursiveCharacterTextSplitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter) to split the text into smaller chunks.
4. Any in-memory vector store is suitable for this application, since we only expect a single PDF; anything more would be over-engineering. We chose [Chroma](https://python.langchain.com/docs/modules/data_connection/vectorstores/integrations/chroma).
5. We use [langchain.chains.RetrievalQAWithSourcesChain](https://python.langchain.com/docs/modules/chains/popular/vector_db_qa#return-source-documents) because it returns the sources, letting end users trace answers back to the source documents. A sketch of how these pieces fit together is shown right after this list.
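
Below is a minimal sketch of the pipeline these choices produce, shown outside of the Chainlit app. The helper name `build_chain`, the chunk sizes, and the standalone usage are illustrative assumptions; the actual wiring lives in [app/app.py](app/app.py).

```python
# Sketch only: wires the Lab 1 choices together outside of Chainlit.
# `build_chain` and the chunk sizes are illustrative, not part of the workshop code.
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PDFPlumberLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


def build_chain(pdf_path: str) -> RetrievalQAWithSourcesChain:
    # 1. Load the PDF (PDFPlumber keeps per-page metadata around)
    documents = PDFPlumberLoader(pdf_path).load()

    # 2. Split into smaller, overlapping chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = splitter.split_documents(documents)

    # Tag each chunk so the chain can cite it as a source
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"

    # 3. Embed and index the chunks in an in-memory Chroma store
    search_engine = Chroma.from_documents(docs, OpenAIEmbeddings())

    # 4. Build a QA chain that also returns its sources
    return RetrievalQAWithSourcesChain.from_chain_type(
        llm=ChatOpenAI(temperature=0),
        chain_type="stuff",
        retriever=search_engine.as_retriever(),
    )
```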

The completed application has the following architecture:
![Final application architecture](assets/arch_final.png)

Now we can [run the application](#run-the-application).

## Lab 2: Basic prompt engineering

@@ -88,7 +96,16 @@ Please resolve this hallucination problem with prompt engineering.

### Lab 2: Solution

[Solution](https://github.com/leehanchung/llm-pdf-qa-workshop/tree/lab1/pdf-qa-app-final)
We used Chainlit's Prompt Playground to experiment with the prompts. First, we inspected the prompts that include the retrieved results and found that the correct operating margins are in fact included, so the model was simply having a difficult time generating summaries from the right context.

We found that if we remove the few-shot examples implemented by Langchain, `gpt-3.5-turbo-0613` is able to generate the right answer. However, for some reason it then rendered the sources as bullet points with summaries, so we experimented further and "fixed" the sources prompt.

To implement the updated prompts in our application, we traced Langchain's Python source code. `RetrievalQAWithSourcesChain` inherits from `BaseQAWithSourcesChain`, which has a class method `from_chain_type()` that uses [`load_qa_with_sources_chain`](https://github.com/hwchase17/langchain/blob/b0859c9b185fe897f3c8e2699835a669b2a2ba61/langchain/chains/qa_with_sources/base.py#L81) to create the chain. That function maps the keyword `stuff` to [`_load_stuff_chain`](https://github.com/hwchase17/langchain/blob/b0859c9b185fe897f3c8e2699835a669b2a2ba61/langchain/chains/qa_with_sources/loading.py#L52), which in turn takes a `prompt` argument and a `document_prompt` argument to create a [`StuffDocumentsChain`](https://github.com/hwchase17/langchain/blob/b0859c9b185fe897f3c8e2699835a669b2a2ba61/langchain/chains/combine_documents/stuff.py#L22) that treats the QA as a document summarization task.

The composition of the overall prompt is as follows:
![Composition of the StuffDocumentsChain prompt](assets/stuff_chain.png)

We then extracted [the prompts into their own file](app/prompts.py) and initialized the `RetrievalQAWithSourcesChain` with our custom prompts, as sketched below!
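
As a rough sketch of that wiring (assuming `search_engine` is the Chroma vector store built earlier), the custom prompts are passed through `chain_type_kwargs`, mirroring the code in [app/app.py](app/app.py):

```python
# Sketch: pass the custom prompts from app/prompts.py into the chain.
# `chain_type_kwargs` is forwarded to the underlying _load_stuff_chain,
# which builds the StuffDocumentsChain shown in the diagram above.
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI

from prompts import EXAMPLE_PROMPT, PROMPT

llm = ChatOpenAI(model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True)

chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=search_engine.as_retriever(max_tokens_limit=4097),
    chain_type_kwargs={"prompt": PROMPT, "document_prompt": EXAMPLE_PROMPT},
)
```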

## LICENSE

137 changes: 82 additions & 55 deletions app/app.py
@@ -1,17 +1,27 @@
# Chroma compatibility issues, hacking per its documentation
# https://docs.trychroma.com/troubleshooting#sqlite
__import__("pysqlite3")
import sys

sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
from typing import List

from tempfile import NamedTemporaryFile

import chainlit as cl
from chainlit.types import AskFileResponse
import chromadb
from chromadb.config import Settings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.schema.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores.base import VectorStore
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.base import Chain
from langchain.chat_models import ChatOpenAI

import chainlit as cl
from chainlit.types import AskFileResponse
from chromadb.config import Settings
from prompts import EXAMPLE_PROMPT, PROMPT


WELCOME_MESSAGE = """\
@@ -22,115 +32,132 @@
"""


def process_file(*, file: AskFileResponse) -> list:
def process_file(*, file: AskFileResponse) -> List[Document]:
if file.type != "application/pdf":
raise TypeError("Only PDF files are supported")

with NamedTemporaryFile() as tempfile:
tempfile.write(file.content)

######################################################################
#
# TODO: 1. Load the PDF
# 1. Load the PDF
#
######################################################################
loader = ...
loader = PDFPlumberLoader(tempfile.name)
######################################################################
documents = loader.load()

######################################################################
#
# TODO: 2. Split the text
# 2. Split the text
#
######################################################################
text_splitter = ...
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
######################################################################

docs = text_splitter.split_documents(documents)

for i, doc in enumerate(docs):
doc.metadata["source"] = f"source_{i}"
return docs

if not docs:
raise ValueError("PDF file parsing failed.")

def create_search_engine(*, file: AskFileResponse) -> VectorStore:
docs = process_file(file=file)
return docs

##########################################################################
#
# TODO: 3. Set the Encoder model for creating embeddings
#
##########################################################################

encoder = ...
##########################################################################

# Save data in the user session
cl.user_session.set("docs", docs)

def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:
# Initialize Chromadb client to enable resetting and disable telemetry
client = chromadb.EphemeralClient()
client_settings = Settings(
chroma_db_impl="duckdb+parquet",
anonymized_telemetry=False,
persist_directory=".chromadb",
allow_reset=True
chroma_db_impl="duckdb+parquet", anonymized_telemetry=False, persist_directory=".chromadb", allow_reset=True
)

# Reset the search engine to ensure we don't use old copies.
# NOTE: we do not need this for production
search_engine = Chroma(client=client, client_settings=client_settings)
search_engine._client.reset()

##########################################################################
#
# TODO: 4. Create the document search engine. Remember to add
# 4. Create the document search engine. Remember to add
# client_settings using the above settings.
#
##########################################################################

search_engine = ...
search_engine = Chroma.from_documents(
client=client, documents=docs, embedding=embeddings, client_settings=client_settings
)
##########################################################################

return search_engine


@cl.langchain_factory(use_async=True)
async def chat() -> Chain:

@cl.on_chat_start
async def on_chat_start():
# Asking user to to upload a PDF to chat with
files = None
while files is None:
files = await cl.AskFileMessage(
content=WELCOME_MESSAGE,
accept=["application/pdf"],
max_size_mb=20,
).send()

file = files[0]

# Process and save data in the user session
msg = cl.Message(content=f"Processing `{file.name}`...")
await msg.send()
docs = process_file(file=file)
cl.user_session.set("docs", docs)
msg.content = f"`{file.name}` processed. Loading ..."
await msg.update()

##########################################################################
#
# 3. Set the Encoder model for creating embeddings
#
##########################################################################
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
##########################################################################
try:
search_engine = await cl.make_async(create_search_engine)(file=file)
search_engine = await cl.make_async(create_search_engine)(docs=docs, embeddings=embeddings)
except Exception as e:
await cl.Message(content=f"Error: {e}").send()
raise SystemError
msg.content = f"`{file.name}` loaded. You can now ask questions!"
await msg.update()

llm = ChatOpenAI(
model='gpt-3.5-turbo-16k-0613',
temperature=0,
streaming=True
)
llm = ChatOpenAI(model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True)

##########################################################################
#
# TODO: 5. Create the chain / tool for RetrievalQAWithSourcesChain.
# 5. Create the chain / tool for RetrievalQAWithSourcesChain.
#
##########################################################################
chain = ...
chain = RetrievalQAWithSourcesChain.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=search_engine.as_retriever(max_tokens_limit=4097),
######################################################################
# 6. Customize prompts to improve summarization and question
# answering performance. Perhaps create your own prompt in prompts.py?
######################################################################
chain_type_kwargs={"prompt": PROMPT, "document_prompt": EXAMPLE_PROMPT},
)
##########################################################################

await msg.update(content=f"`{file.name}` processed. You can now ask questions!")
cl.user_session.set("chain", chain)

return chain

@cl.on_message
async def main(message: cl.Message):
chain = cl.user_session.get("chain") # type: RetrievalQAWithSourcesChain
response = await chain.acall(
message.content, callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)]
)

@cl.langchain_postprocess
async def process_response(res):
answer = res["answer"]
sources = res["sources"].strip()
answer = response["answer"]
sources = response["sources"].strip()
source_elements = []

# Get the documents from the user session
19 changes: 19 additions & 0 deletions app/prompts.py
@@ -0,0 +1,19 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])

EXAMPLE_PROMPT = PromptTemplate(
template="Content: {page_content}\nSource: {source}",
input_variables=["page_content", "source"],
)
Binary file added assets/arch_final.png
Binary file added assets/stuff_chain.png