From 2e9a9e25f1c67e942d2ba503252ba109475c19dc Mon Sep 17 00:00:00 2001
From: bojiang
Date: Fri, 27 Sep 2024 15:25:25 +0800
Subject: [PATCH] fix for pixtral

---
 src/recipe.yaml          | 22 ++++++++++++++++++++++
 src/vllm-chat/service.py | 32 ++++++++------------------------
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/src/recipe.yaml b/src/recipe.yaml
index 7474fb38..7cec2785 100644
--- a/src/recipe.yaml
+++ b/src/recipe.yaml
@@ -757,3 +757,25 @@
   extra_labels:
     openllm_alias: 11b-vision
     model_name: meta-llama/Llama-3.2-11B-Vision-Instruct
+'pixtral:12b-240910':
+  project: vllm-chat
+  service_config:
+    name: pixtral
+    traffic:
+      timeout: 300
+    resources:
+      gpu: 1
+      gpu_type: nvidia-a100-80gb
+  engine_config:
+    model: mistral-community/pixtral-12b-240910
+    tokenizer_mode: mistral
+    enable_prefix_caching: true
+    enable_chunked_prefill: false
+    limit_mm_per_prompt:
+      image: 1
+    max_model_len: 16384
+  extra_labels:
+    openllm_alias: 12b, 12b-vision
+    model_name: mistral-community/pixtral-12b-240910
+  extra_requirements:
+    - mistral_common[opencv]
diff --git a/src/vllm-chat/service.py b/src/vllm-chat/service.py
index 451c47cb..77da20c9 100644
--- a/src/vllm-chat/service.py
+++ b/src/vllm-chat/service.py
@@ -4,7 +4,7 @@
 import os
 import traceback
 from argparse import Namespace
-from typing import AsyncGenerator, Literal, Optional, Union
+from typing import AsyncGenerator, Literal, Optional, Union, Sequence
 
 import bentoml
 import fastapi
@@ -29,7 +29,7 @@ class ImageContent(pydantic.BaseModel):
 
 class Message(pydantic.BaseModel):
     role: Literal["system", "user", "assistant"] = "user"
-    content: list[Union[TextContent, ImageContent]]
+    content: Sequence[Union[TextContent, ImageContent]]
 
 
 PARAMETER_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml")
@@ -114,31 +114,15 @@ def __init__(self) -> None:
         init_app_state(self.engine, model_config, openai_api_app.state, args)
 
     @bentoml.api
-    async def generate(
-        self, prompt: str = "what is this?"
-    ) -> AsyncGenerator[str, None]:
-        from openai import AsyncOpenAI
-
-        client = AsyncOpenAI(base_url="http://127.0.0.1:3000/v1", api_key="dummy")
-        content = [TextContent(text=prompt)]
-        message = Message(role="user", content=content)
-
-        try:
-            completion = await client.chat.completions.create(  # type: ignore
-                model=ENGINE_CONFIG["model"],
-                messages=[message.model_dump()],  # type: ignore
-                stream=True,
-            )
-            async for chunk in completion:
-                yield chunk.choices[0].delta.content or ""
-        except Exception:
-            yield traceback.format_exc()
-        # async for text in self.generate_with_image(prompt):
-        #     yield text
+    async def generate(self, prompt: str = "what is this?") -> AsyncGenerator[str, None]:
+        async for text in self.generate_with_image(prompt):
+            yield text
 
     @bentoml.api
     async def generate_with_image(
-        self, prompt: str = "what is this?", image: Optional[PIL.Image.Image] = None
+        self,
+        prompt: str = "what is this?",
+        image: Optional[PIL.Image.Image] = None,
     ) -> AsyncGenerator[str, None]:
         from openai import AsyncOpenAI
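
The inline body removed from generate was simply a streaming call against the OpenAI-compatible /v1 route this service mounts, and that route remains the way to exercise Pixtral with an image after this change. The snippet below is a minimal sketch, not part of the patch: it assumes the Bento is serving locally on port 3000 (as the removed client did), reuses the model name from engine_config, and uses a placeholder image URL; limit_mm_per_prompt caps the request at one image per prompt.

# Sketch only: stream a Pixtral chat completion with one image through the
# OpenAI-compatible route mounted by the service.
# Assumptions: service reachable at http://127.0.0.1:3000 (as in the removed
# inline client); the image URL below is a placeholder.
import asyncio

from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI(base_url="http://127.0.0.1:3000/v1", api_key="dummy")
    completion = await client.chat.completions.create(
        model="mistral-community/pixtral-12b-240910",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "what is this?"},
                    # limit_mm_per_prompt.image is 1, so send at most one image.
                    {"type": "image_url", "image_url": {"url": "https://example.com/sample.png"}},
                ],
            }
        ],
        stream=True,
    )
    async for chunk in completion:
        print(chunk.choices[0].delta.content or "", end="", flush=True)


asyncio.run(main())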