langchain-ai · cwlacewe · Oct 17, 2024 · Oct 17, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/cookbook/multi_modal_RAG_vdms.ipynb b/cookbook/multi_modal_RAG_vdms.ipynb
@@ -42,12 +42,14 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "a1b9206b08ef626e15b356bf9e031171f7c7eb8f956a2733f196f0109246fe2b\n"
+      "87218928619b1301f3079123d7289b6c527481a72b352788867332568fd2f343\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
      ]
     }
    ],
    "source": [
     "! docker run --rm -d -p 55559:55555 --name vdms_rag_nb intellabs/vdms:latest\n",
+    "%pip install --quiet -U vdms\n",
     "\n",
     "# Connect to VDMS Vector Store\n",
     "from langchain_community.vectorstores.vdms import VDMS_Client\n",
@@ -72,10 +74,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! pip install --quiet -U vdms langchain-experimental\n",
-    "\n",
-    "# lock to 0.10.19 due to a persistent bug in more recent versions\n",
-    "! pip install --quiet pdf2image \"unstructured[all-docs]==0.10.19\" pillow pydantic lxml open_clip_torch"
+    "! pip install -q \"onnxruntime==1.17.0\" \"unstructured[all-docs]==0.10.19\" pillow pydantic lxml matplotlib tiktoken open_clip_torch torch"
    ]
   },
   {
@@ -115,11 +114,12 @@
     "import requests\n",
     "\n",
     "# Folder to store pdf and extracted images\n",
-    "datapath = Path(\"./data/multimodal_files\").resolve()\n",
+    "base_datapath = Path(\"./data/multimodal_files\").resolve()\n",
+    "datapath = base_datapath / \"images\"\n",
     "datapath.mkdir(parents=True, exist_ok=True)\n",
     "\n",
     "pdf_url = \"https://www.loc.gov/lcm/pdf/LCM_2020_1112.pdf\"\n",
-    "pdf_path = str(datapath / pdf_url.split(\"/\")[-1])\n",
+    "pdf_path = str(base_datapath / pdf_url.split(\"/\")[-1])\n",
     "with open(pdf_path, \"wb\") as f:\n",
     "    f.write(requests.get(pdf_url).content)"
    ]
@@ -310,12 +310,21 @@
    "execution_count": 9,
    "id": "4c93fab3-74c4-4f1d-958a-0bc4cdd0797e",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
-    "from langchain_community.llms.ollama import Ollama\n",
-    "from langchain_core.messages import HumanMessage\n",
+    "%pip install -Uq langchain-ollama\n",
+    "from langchain_core.messages import HumanMessage, SystemMessage\n",
     "from langchain_core.output_parsers import StrOutputParser\n",
     "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n",
+    "from langchain_ollama.llms import OllamaLLM\n",
     "\n",
     "\n",
     "def prompt_func(data_dict):\n",
@@ -340,8 +349,8 @@
     "            \"As an expert art critic and historian, your task is to analyze and interpret images, \"\n",
     "            \"considering their historical and cultural significance. Alongside the images, you will be \"\n",
     "            \"provided with related text to offer context. Both will be retrieved from a vectorstore based \"\n",
-    "            \"on user-input keywords. Please convert answers to english and use your extensive knowledge \"\n",
-    "            \"and analytical skills to provide a comprehensive summary that includes:\\n\"\n",
+    "            \"on user-input keywords. Please use your extensive knowledge and analytical skills to provide a \"\n",
+    "            \"comprehensive summary that includes:\\n\"\n",
     "            \"- A detailed description of the visual elements in the image.\\n\"\n",
     "            \"- The historical and cultural context of the image.\\n\"\n",
     "            \"- An interpretation of the image's symbolism and meaning.\\n\"\n",
@@ -359,7 +368,7 @@
     "    \"\"\"Multi-modal RAG chain\"\"\"\n",
     "\n",
     "    # Multi-modal LLM\n",
-    "    llm_model = Ollama(\n",
+    "    llm_model = OllamaLLM(\n",
     "        verbose=True, temperature=0.5, model=\"llava\", base_url=\"http://localhost:11434\"\n",
     "    )\n",
     "\n",
@@ -461,10 +470,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      " The image depicts a woman with several children. The woman appears to be of Cherokee heritage, as suggested by the text provided. The image is described as having been initially regretted by the subject, Florence Owens Thompson, due to her feeling that it did not accurately represent her leadership qualities.\n",
-      "The historical and cultural context of the image is tied to the Great Depression and the Dust Bowl, both of which affected the Cherokee people in Oklahoma. The photograph was taken during this period, and its subject, Florence Owens Thompson, was a leader within her community who worked tirelessly to help those affected by these crises.\n",
-      "The image's symbolism and meaning can be interpreted as a representation of resilience and strength in the face of adversity. The woman is depicted with multiple children, which could signify her role as a caregiver and protector during difficult times.\n",
-      "Connections between the image and the related text include Florence Owens Thompson's leadership qualities and her regretted feelings about the photograph. Additionally, the mention of Dorothea Lange, the photographer who took this photo, ties the image to its historical context and the broader narrative of the Great Depression and Dust Bowl in Oklahoma. \n"
+      " The image is a black and white photograph taken by Dorothea Lange in March 1936 as part of the Farm Security Administration's Office of War Information Collection. It depicts a woman, Florence Owens Thompson, a Cherokee from Oklahoma, standing next to her seven children. The photograph has been described as \"Great Photographs\" and is titled \"DESTITUTE PEA PICKERS IN CALIFORNIA. MOTHER OF SEVEN CHILDREN. AGE THIRTY-TWO. NIPOMO, CALIFORNIA.\"\n",
+      "\n",
+      "The woman in the photograph appears to be in a state of poverty and hardship. She is dressed in simple clothing, and her facial expression suggests that she is not happy about having her picture taken. The background of the image shows a barren landscape with no visible signs of prosperity or abundance.\n",
+      "\n",
+      "The related text provides some context for the photograph. It mentions that Florence Owens Thompson initially regretted having her picture taken by Dorothea Lange, as she felt that it did not accurately represent her character. However, her daughter Katherine later said that she was a strong and influential leader in her community.\n",
+      "\n",
+      "The interpretation of the image's symbolism and meaning could be that it serves as a powerful visual representation of the struggles faced by people during the Great Depression. The photograph captures the hardships and poverty experienced by many individuals at that time, and it highlights the resilience and strength of those who were affected by these difficult circumstances.\n",
+      "\n",
+      "In terms of connections between the image and the related text, the photograph serves as a visual complement to the written text, which provides additional information about the subject of the photograph and her community. The text helps to provide a more complete understanding of the context in which the photograph was taken and the impact it had on the people involved. Overall, the combination of the image and the related text offers a comprehensive summary that highlights the historical and cultural significance of this important photograph. \n"
      ]
     }
    ],
@@ -491,6 +505,14 @@
    "source": [
     "! docker kill vdms_rag_nb"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe4a98ee",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -509,7 +531,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,

diff --git a/cookbook/visual_RAG_vdms.ipynb b/cookbook/visual_RAG_vdms.ipynb
@@ -26,7 +26,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2e44b44201c8778b462342ac97f5ccf05a4e02aa8a04505ecde97bf20dcc4cbb\n"
+      "183c5eb067431896e0bd138fbf7f124af6317b522152051fbb0dc977baf3802c\n"
      ]
     }
    ],
@@ -363,7 +363,7 @@
       "\t\tThere are 2 shoppers in this video. Shopper 1 is wearing a plaid shirt and a spectacle. Shopper 2 who is not completely captured in the frame seems to wear a black shirt and is moving away with his back turned towards the camera. There is a shelf towards the right of the camera frame. Shopper 2 is hanging an item back to a hanger and then quickly walks away in a similar fashion as shopper 2. Contents of the nearer side of the shelf with respect to camera seems to be camping lanterns and cleansing agents, arranged at the top. In the middle part of the shelf, various tools including grommets, a pocket saw, candles, and other helpful camping items can be observed. Midway through the shelf contains items which appear to be steel containers and items made up of plastic with red, green, orange, and yellow colors, while those at the bottom are packed in cardboard boxes. Contents at the farther part of the shelf are well stocked and organized but are not glaringly visible.\n",
       "\n",
       "\tMetadata:\n",
-      "\t\t{'fps': 24.0, 'id': 'c6e5f894-b905-46f5-ac9e-4487a9235561', 'total_frames': 120.0, 'video': 'clip16.mp4'}\n",
+      "\t\t{'fps': 24.0, 'total_frames': 120.0, 'video': 'clip16.mp4'}\n",
       "Retrieved Top matching video!\n",
       "\n",
       "\n"
@@ -394,7 +394,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3edf8783e114487ca490d8dec5c46884",
+       "model_id": "62dd5deb78ff4ffdac9f13e7cfda1167",
        "version_major": 2,
        "version_minor": 0
       },
@@ -404,6 +404,13 @@
      },
      "metadata": {},
      "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:accelerate.big_modeling:Some parameters are on the meta device because they were offloaded to the cpu.\n"
+     ]
     }
    ],
    "source": [
@@ -555,7 +562,7 @@
       "\t\tA single shopper is seen in this video standing facing the shelf and in the bottom part of the frame. He's wearing a light-colored shirt and a spectacle. The shopper is carrying a red colored basket in his left hand. The entire basket is not clearly visible, but it does seem to contain something in a blue colored package which the shopper has just placed in the basket given his right hand was seen inside the basket. Then the shopper leans towards the shelf and checks out an item in orange package. He picks this single item with his right hand and proceeds to place the item in the basket. The entire shelf looks well stocked except for the top part of the shelf which is empty. The shopper has not picked any item from this part of the shelf. The rest of the shelf looks well stocked and does not need any restocking. The contents on the farther part of the shelf consists of items, majority of which are packed in black, yellow, and green packages. No other details are visible of these items.\n",
       "\n",
       "\tMetadata:\n",
-      "\t\t{'fps': 24.0, 'id': '37ddc212-994e-4db0-877f-5ed09965ab90', 'total_frames': 162.0, 'video': 'clip10.mp4'}\n",
+      "\t\t{'fps': 24.0, 'total_frames': 162.0, 'video': 'clip10.mp4'}\n",
       "Retrieved Top matching video!\n",
       "\n",
       "\n"
@@ -585,7 +592,7 @@
       "User :  Find a man holding a red shopping basket\n",
       "Assistant :  Most relevant retrieved video is **clip9.mp4** \n",
       "\n",
-      "I see a person standing in front of a well-stocked shelf, they are wearing a light-colored shirt and glasses, and they have a red shopping basket in their left hand. They are leaning forward and picking up an item from the shelf with their right hand. The item is packaged in a blue-green box. Based on the scene description, I can confirm that the person is indeed holding a red shopping basket.</s>\n"
+      "I see a person standing in front of a well-stocked shelf, they are wearing a light-colored shirt and glasses, and they have a red shopping basket in their left hand. They are leaning forward and picking up an item from the shelf with their right hand. The item is packaged in a blue-green box. Based on the available information, I cannot confirm whether the basket is empty or contains items. However, the rest of the\n"
      ]
     }
    ],
@@ -655,7 +662,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": ".langchain-venv",
    "language": "python",
    "name": "python3"
   },
@@ -669,7 +676,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,

diff --git a/docs/docs/integrations/providers/vdms.mdx b/docs/docs/integrations/providers/vdms.mdx
@@ -18,7 +18,8 @@ There are two ways to get started with VDMS:
 
 #### Install VDMS on your local machine via docker
 ```bash
-    docker run -d -p 55555:55555 intellabs/vdms:latest
+docker pull intellabs/vdms:latest
+docker run -d -p 55555:55555 intellabs/vdms:latest
 ```
 
 #### Install VDMS directly on your local machine
@@ -49,7 +50,7 @@ vectorstore = VDMS.from_documents(
     docs,
     client=client,
     collection_name="langchain-demo",
-    embedding_function=HuggingFaceEmbeddings(model_name=model_name),
+    embedding=HuggingFaceEmbeddings(model_name=model_name),
     engine="FaissFlat",
     distance_strategy="L2",
 )