From 2e9a9e25f1c67e942d2ba503252ba109475c19dc Mon Sep 17 00:00:00 2001
From: bojiang
Date: Fri, 27 Sep 2024 15:25:25 +0800
Subject: [PATCH] fix for pixtral

---
 src/recipe.yaml          | 22 ++++++++++++++++++++++
 src/vllm-chat/service.py | 32 ++++++++------------------------
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/src/recipe.yaml b/src/recipe.yaml
index 7474fb38..7cec2785 100644
--- a/src/recipe.yaml
+++ b/src/recipe.yaml
@@ -757,3 +757,25 @@
   extra_labels:
     openllm_alias: 11b-vision
     model_name: meta-llama/Llama-3.2-11B-Vision-Instruct
+'pixtral:12b-240910':
+  project: vllm-chat
+  service_config:
+    name: pixtral
+    traffic:
+      timeout: 300
+    resources:
+      gpu: 1
+      gpu_type: nvidia-a100-80gb
+  engine_config:
+    model: mistral-community/pixtral-12b-240910
+    tokenizer_mode: mistral
+    enable_prefix_caching: true
+    enable_chunked_prefill: false
+    limit_mm_per_prompt:
+      image: 1
+    max_model_len: 16384
+  extra_labels:
+    openllm_alias: 12b, 12b-vision
+    model_name: mistral-community/pixtral-12b-240910
+  extra_requirements:
+    - mistral_common[opencv]
diff --git a/src/vllm-chat/service.py b/src/vllm-chat/service.py
index 451c47cb..77da20c9 100644
--- a/src/vllm-chat/service.py
+++ b/src/vllm-chat/service.py
@@ -4,7 +4,7 @@
 import os
 import traceback
 from argparse import Namespace
-from typing import AsyncGenerator, Literal, Optional, Union
+from typing import AsyncGenerator, Literal, Optional, Union, Sequence
 
 import bentoml
 import fastapi
@@ -29,7 +29,7 @@ class ImageContent(pydantic.BaseModel):
 
 class Message(pydantic.BaseModel):
     role: Literal["system", "user", "assistant"] = "user"
-    content: list[Union[TextContent, ImageContent]]
+    content: Sequence[Union[TextContent, ImageContent]]
 
 
 PARAMETER_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml")
@@ -114,31 +114,15 @@ def __init__(self) -> None:
         init_app_state(self.engine, model_config, openai_api_app.state, args)
 
     @bentoml.api
-    async def generate(
-        self, prompt: str = "what is this?"
-    ) -> AsyncGenerator[str, None]:
-        from openai import AsyncOpenAI
-
-        client = AsyncOpenAI(base_url="http://127.0.0.1:3000/v1", api_key="dummy")
-        content = [TextContent(text=prompt)]
-        message = Message(role="user", content=content)
-
-        try:
-            completion = await client.chat.completions.create(  # type: ignore
-                model=ENGINE_CONFIG["model"],
-                messages=[message.model_dump()],  # type: ignore
-                stream=True,
-            )
-            async for chunk in completion:
-                yield chunk.choices[0].delta.content or ""
-        except Exception:
-            yield traceback.format_exc()
-        # async for text in self.generate_with_image(prompt):
-        #     yield text
+    async def generate(self, prompt: str = "what is this?") -> AsyncGenerator[str, None]:
+        async for text in self.generate_with_image(prompt):
+            yield text
 
     @bentoml.api
     async def generate_with_image(
-        self, prompt: str = "what is this?", image: Optional[PIL.Image.Image] = None
+        self,
+        prompt: str = "what is this?",
+        image: Optional[PIL.Image.Image] = None,
     ) -> AsyncGenerator[str, None]:
         from openai import AsyncOpenAI
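
The inline body removed from generate was simply a streaming call against the OpenAI-compatible /v1 route this service mounts, and that route remains the way to exercise Pixtral with an image after this change. The snippet below is a minimal sketch, not part of the patch: it assumes the Bento is serving locally on port 3000 (as the removed client did), reuses the model name from engine_config, and uses a placeholder image URL; limit_mm_per_prompt caps the request at one image per prompt.

# Sketch only: stream a Pixtral chat completion with one image through the
# OpenAI-compatible route mounted by the service.
# Assumptions: service reachable at http://127.0.0.1:3000 (as in the removed
# inline client); the image URL below is a placeholder.
import asyncio

from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI(base_url="http://127.0.0.1:3000/v1", api_key="dummy")
    completion = await client.chat.completions.create(
        model="mistral-community/pixtral-12b-240910",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "what is this?"},
                    # limit_mm_per_prompt.image is 1, so send at most one image.
                    {"type": "image_url", "image_url": {"url": "https://example.com/sample.png"}},
                ],
            }
        ],
        stream=True,
    )
    async for chunk in completion:
        print(chunk.choices[0].delta.content or "", end="", flush=True)


asyncio.run(main())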