Commit

Update all bentos

rickzx committed Jun 21, 2024
1 parent abe3658 commit d6f44d3

Showing 2,226 changed files with 6,426 additions and 1,806 deletions.
89 changes: 73 additions & 16 deletions bentoml/bentos/gemma/2b-instruct-fp16/apis/openapi.yaml
@@ -12,9 +12,7 @@ components:
           - type: 'null'
           title: Content
         function_call:
-          anyOf:
-          - $ref: '#/components/schemas/FunctionCall'
-          - type: 'null'
+          $ref: '#/components/schemas/FunctionCall'
         name:
           title: Name
           type: string
@@ -230,6 +228,7 @@ components:
             - $ref: '#/components/schemas/ChatCompletionAssistantMessageParam'
             - $ref: '#/components/schemas/ChatCompletionToolMessageParam'
             - $ref: '#/components/schemas/ChatCompletionFunctionMessageParam'
+            - $ref: '#/components/schemas/CustomChatCompletionMessageParam'
           title: Messages
           type: array
         min_p:
@@ -325,6 +324,7 @@ components:
           anyOf:
           - type: integer
           - type: 'null'
+          default: 0
           title: Top Logprobs
         top_p:
           anyOf:
@@ -396,6 +396,7 @@ components:
             anyOf:
             - $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
             - $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
+            - $ref: '#/components/schemas/CustomChatCompletionContentPartParam'
           type: array
         title: Content
       name:
@@ -657,6 +658,39 @@ components:
       - prompt
       title: CompletionRequest
       type: object
+    CustomChatCompletionContentPartParam:
+      additionalProperties: true
+      properties:
+        type:
+          title: Type
+          type: string
+      required:
+      - type
+      title: CustomChatCompletionContentPartParam
+      type: object
+    CustomChatCompletionMessageParam:
+      description: Enables custom roles in the Chat Completion API.
+      properties:
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
+              - $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
+              - $ref: '#/components/schemas/CustomChatCompletionContentPartParam'
+            type: array
+          title: Content
+        name:
+          title: Name
+          type: string
+        role:
+          title: Role
+          type: string
+      required:
+      - role
+      title: CustomChatCompletionMessageParam
+      type: object
     Function:
       properties:
         arguments:
@@ -868,16 +902,6 @@ info:
   version: None
 openapi: 3.0.2
 paths:
-  /:
-    get:
-      operationId: index__get
-      responses:
-        '200':
-          content:
-            application/json:
-              schema: {}
-          description: Successful Response
-      summary: Index
   /api/chat:
     post:
       description: "\n light-weight chat API that takes in a list of messages\
@@ -1011,6 +1035,39 @@ paths:
       tags:
       - Service APIs
       x-bentoml-name: generate
+  /chat/:
+    get:
+      operationId: serve_chat_html__get
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+      summary: Serve Chat Html
+  /chat/{full_path}:
+    get:
+      operationId: catch_all__full_path__get
+      parameters:
+      - in: path
+        name: full_path
+        required: true
+        schema:
+          title: Full Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: Catch All
   /healthz:
     get:
       description: Health check endpoint. Expecting an empty response with status
@@ -1053,7 +1110,7 @@ paths:
   /v1/chat/completions:
     post:
-      operationId: create_chat_completion_v1_chat_completions_post
+      operationId: create_chat_completion_chat_completions_post
       requestBody:
         content:
           application/json:
@@ -1075,7 +1132,7 @@ paths:
       summary: Create Chat Completion
   /v1/completions:
     post:
-      operationId: create_completion_v1_completions_post
+      operationId: create_completion_completions_post
       requestBody:
         content:
           application/json:
@@ -1097,7 +1154,7 @@ paths:
       summary: Create Completion
   /v1/models:
     get:
-      operationId: show_available_models_v1_models_get
+      operationId: show_available_models_models_get
       responses:
         '200':
           content:
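The two Custom* schemas added above mirror vLLM's OpenAI-compatible protocol types; CustomChatCompletionMessageParam relaxes the role field to accept any string, not just system/user/assistant. A rough sketch of a request the relaxed schema permits — the port, model id, and custom role name are illustrative assumptions, not part of this diff:

    import requests

    # Hypothetical request against a locally served bento
    # (`bentoml serve`, default port 3000). Only "role" is required by
    # CustomChatCompletionMessageParam; "content" and "name" are optional.
    resp = requests.post(
        "http://localhost:3000/v1/chat/completions",
        json={
            "model": "google/gemma-2b-it",  # assumed model id for this bento
            "messages": [
                {"role": "user", "content": "Summarize this diff."},
                {"role": "reviewer", "content": "Be strict."},  # custom role
            ],
            "max_tokens": 64,
        },
        timeout=60,
    )
    print(resp.json()["choices"][0]["message"]["content"])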
3 changes: 2 additions & 1 deletion bentoml/bentos/gemma/2b-instruct-fp16/bento.yaml
@@ -2,12 +2,13 @@ service: service:VLLM
 name: gemma
 version: 2b-instruct-fp16
 bentoml_version: 1.2.17
-creation_time: '2024-06-13T09:12:44.075970+00:00'
+creation_time: '2024-06-21T07:57:18.133866+00:00'
 labels:
   owner: bentoml-team
   platforms: linux
   source_repo: https://github.com/bentoml/openllm-repo-recipe.git
   source_directory: vllm-chat
+  service_home: /chat
 models: []
 runners: []
 entry_service: gemma
@@ -44,7 +44,7 @@ WORKDIR $BENTO_PATH
 # Block SETUP_BENTO_COMPONENTS
 
 RUN pip3 install torch==2.3.0 ; exit 0
-RUN pip3 install vllm==0.4.2 ; exit 0
+RUN pip3 install vllm==0.4.3 ; exit 0
 COPY --chown=bentoml:bentoml ./env/python ./env/python/
 # install python packages with install.sh
 RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh
@@ -1,5 +1,6 @@
 bentoml==1.2.17
 torch==2.3.0
-vllm==0.4.2
+vllm==0.4.3
 numpy==1.26.0
 transformers==4.41.0
+pyyaml
@@ -1 +1 @@
-3.11.9
+3.11.7
41 changes: 28 additions & 13 deletions bentoml/bentos/gemma/2b-instruct-fp16/src/service.py
@@ -13,6 +13,7 @@
 import yaml
 from annotated_types import Ge, Le
 from bento_constants import CONSTANT_YAML
+from fastapi.responses import FileResponse
 from typing_extensions import Annotated
 
 CONSTANTS = yaml.safe_load(CONSTANT_YAML)
@@ -26,12 +27,14 @@
 
 
 openai_api_app = fastapi.FastAPI()
+static_app = fastapi.FastAPI()
+ui_app = fastapi.FastAPI()
 
 
 OPENAI_ENDPOINTS = [
-    ["/v1/chat/completions", vllm_api_server.create_chat_completion, ["POST"]],
-    ["/v1/completions", vllm_api_server.create_completion, ["POST"]],
-    ["/v1/models", vllm_api_server.show_available_models, ["GET"]],
+    ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]],
+    ["/completions", vllm_api_server.create_completion, ["POST"]],
+    ["/models", vllm_api_server.show_available_models, ["GET"]],
 ]
 
 
@@ -45,31 +48,40 @@
 
 
 STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui")
-INDEX_HTML = os.path.join(os.path.dirname(__file__), "ui", "index.html")
 
-openai_api_app.mount("/_next", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR))
+ui_app.mount(
+    "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static"
+)
+
 
-@openai_api_app.get("/")
-async def index():
-    with open(INDEX_HTML) as f:
-        return fastapi.responses.HTMLResponse(content=f.read())
+@ui_app.get("/")
+async def serve_chat_html():
+    return FileResponse(os.path.join(STATIC_DIR, "chat.html"))
 
 
+@ui_app.get("/{full_path:path}")
+async def catch_all(full_path: str):
+    file_path = os.path.join(STATIC_DIR, full_path)
+    if os.path.exists(file_path):
+        return FileResponse(file_path)
+    return FileResponse(os.path.join(STATIC_DIR, "chat.html"))
 
 
 # special handling for prometheus_client of bentoml
 if "prometheus_client" in sys.modules:
     sys.modules.pop("prometheus_client")
 
 
-@bentoml.mount_asgi_app(openai_api_app)
+@bentoml.mount_asgi_app(openai_api_app, path="/v1")
+@bentoml.mount_asgi_app(ui_app, path="/chat")
 @bentoml.service(**SERVICE_CONFIG)
 class VLLM:
     def __init__(self) -> None:
         from transformers import AutoTokenizer
         from vllm import AsyncEngineArgs, AsyncLLMEngine
         from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-        from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
+        from vllm.entrypoints.openai.serving_completion import \
+            OpenAIServingCompletion
 
         ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG)
         self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS)
@@ -82,18 +94,21 @@ def __init__(self) -> None:
         else:
             chat_template = None
 
+        model_config = self.engine.engine.get_model_config()
+
         # inject the engine into the openai serving chat and completion
         vllm_api_server.openai_serving_chat = OpenAIServingChat(
             engine=self.engine,
             served_model_names=[ENGINE_CONFIG["model"]],
             response_role="assistant",
             chat_template=chat_template,
-            # args.lora_modules,
+            model_config=model_config,
         )
         vllm_api_server.openai_serving_completion = OpenAIServingCompletion(
             engine=self.engine,
             served_model_names=[ENGINE_CONFIG["model"]],
-            # args.lora_modules,
+            model_config=model_config,
+            lora_modules=None,
        )
 
     @bentoml.api(route="/api/generate")
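Taken together, the service.py changes register the OpenAI routes without their /v1 prefix and mount the sub-app at path="/v1", so the public URLs are unchanged (which is why only the operationIds moved in openapi.yaml), while a second FastAPI app serves the chat UI under /chat with a single-page-app fallback. The new model_config arguments appear to track the updated OpenAIServingChat/OpenAIServingCompletion constructors in vllm 0.4.3. A quick routing check, assuming a local `bentoml serve` on the default port:

    import requests

    base = "http://localhost:3000"  # assumed local serve address

    # OpenAI-compatible API: registered internally as /models,
    # publicly reachable as /v1/models after the remount.
    print(requests.get(f"{base}/v1/models", timeout=10).json())

    # Chat UI: /chat/ returns chat.html, and an unknown path under /chat/
    # falls back to chat.html via the catch_all route.
    print(requests.get(f"{base}/chat/no-such-page", timeout=10).status_code)  # expect 200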