Commit

Update all bentos

rickzx committed Jun 21, 2024
1 parent abe3658 commit d6f44d3

Showing 2,226 changed files with 6,426 additions and 1,806 deletions.
89 changes: 73 additions & 16 deletions bentoml/bentos/gemma/2b-instruct-fp16/apis/openapi.yaml
@@ -12,9 +12,7 @@ components:
           - type: 'null'
           title: Content
         function_call:
-          anyOf:
-          - $ref: '#/components/schemas/FunctionCall'
-          - type: 'null'
+          $ref: '#/components/schemas/FunctionCall'
         name:
           title: Name
           type: string
@@ -230,6 +228,7 @@ components:
             - $ref: '#/components/schemas/ChatCompletionAssistantMessageParam'
             - $ref: '#/components/schemas/ChatCompletionToolMessageParam'
             - $ref: '#/components/schemas/ChatCompletionFunctionMessageParam'
+            - $ref: '#/components/schemas/CustomChatCompletionMessageParam'
           title: Messages
           type: array
         min_p:
@@ -325,6 +324,7 @@ components:
           anyOf:
           - type: integer
           - type: 'null'
+          default: 0
           title: Top Logprobs
         top_p:
           anyOf:
@@ -396,6 +396,7 @@ components:
             anyOf:
             - $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
             - $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
+            - $ref: '#/components/schemas/CustomChatCompletionContentPartParam'
           type: array
         title: Content
       name:
@@ -657,6 +658,39 @@ components:
       - prompt
       title: CompletionRequest
       type: object
+    CustomChatCompletionContentPartParam:
+      additionalProperties: true
+      properties:
+        type:
+          title: Type
+          type: string
+      required:
+      - type
+      title: CustomChatCompletionContentPartParam
+      type: object
+    CustomChatCompletionMessageParam:
+      description: Enables custom roles in the Chat Completion API.
+      properties:
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
+              - $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
+              - $ref: '#/components/schemas/CustomChatCompletionContentPartParam'
+            type: array
+          title: Content
+        name:
+          title: Name
+          type: string
+        role:
+          title: Role
+          type: string
+      required:
+      - role
+      title: CustomChatCompletionMessageParam
+      type: object
     Function:
       properties:
         arguments:
@@ -868,16 +902,6 @@ info:
   version: None
 openapi: 3.0.2
 paths:
-  /:
-    get:
-      operationId: index__get
-      responses:
-        '200':
-          content:
-            application/json:
-              schema: {}
-          description: Successful Response
-      summary: Index
   /api/chat:
     post:
       description: "\n light-weight chat API that takes in a list of messages\
@@ -1011,6 +1035,39 @@ paths:
       tags:
       - Service APIs
       x-bentoml-name: generate
+  /chat/:
+    get:
+      operationId: serve_chat_html__get
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+      summary: Serve Chat Html
+  /chat/{full_path}:
+    get:
+      operationId: catch_all__full_path__get
+      parameters:
+      - in: path
+        name: full_path
+        required: true
+        schema:
+          title: Full Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: Catch All
   /healthz:
     get:
       description: Health check endpoint. Expecting an empty response with status
@@ -1053,7 +1110,7 @@ paths:
   /v1/chat/completions:
     post:
-      operationId: create_chat_completion_v1_chat_completions_post
+      operationId: create_chat_completion_chat_completions_post
       requestBody:
         content:
           application/json:
@@ -1075,7 +1132,7 @@ paths:
       summary: Create Chat Completion
   /v1/completions:
     post:
-      operationId: create_completion_v1_completions_post
+      operationId: create_completion_completions_post
       requestBody:
         content:
           application/json:
@@ -1097,7 +1154,7 @@ paths:
       summary: Create Completion
   /v1/models:
     get:
-      operationId: show_available_models_v1_models_get
+      operationId: show_available_models_models_get
       responses:
         '200':
           content:
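The two Custom* schemas added above mirror vLLM's OpenAI-compatible protocol types; CustomChatCompletionMessageParam relaxes the role field to accept any string, not just system/user/assistant. A rough sketch of a request the relaxed schema permits — the port, model id, and custom role name are illustrative assumptions, not part of this diff:

    import requests

    # Hypothetical request against a locally served bento
    # (`bentoml serve`, default port 3000). Only "role" is required by
    # CustomChatCompletionMessageParam; "content" and "name" are optional.
    resp = requests.post(
        "http://localhost:3000/v1/chat/completions",
        json={
            "model": "google/gemma-2b-it",  # assumed model id for this bento
            "messages": [
                {"role": "user", "content": "Summarize this diff."},
                {"role": "reviewer", "content": "Be strict."},  # custom role
            ],
            "max_tokens": 64,
        },
        timeout=60,
    )
    print(resp.json()["choices"][0]["message"]["content"])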
3 changes: 2 additions & 1 deletion bentoml/bentos/gemma/2b-instruct-fp16/bento.yaml
@@ -2,12 +2,13 @@ service: service:VLLM
 name: gemma
 version: 2b-instruct-fp16
 bentoml_version: 1.2.17
-creation_time: '2024-06-13T09:12:44.075970+00:00'
+creation_time: '2024-06-21T07:57:18.133866+00:00'
 labels:
   owner: bentoml-team
   platforms: linux
   source_repo: https://github.com/bentoml/openllm-repo-recipe.git
   source_directory: vllm-chat
+  service_home: /chat
 models: []
 runners: []
 entry_service: gemma
@@ -44,7 +44,7 @@ WORKDIR $BENTO_PATH
 # Block SETUP_BENTO_COMPONENTS
 
 RUN pip3 install torch==2.3.0 ; exit 0
-RUN pip3 install vllm==0.4.2 ; exit 0
+RUN pip3 install vllm==0.4.3 ; exit 0
 COPY --chown=bentoml:bentoml ./env/python ./env/python/
 # install python packages with install.sh
 RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh
@@ -1,5 +1,6 @@
 bentoml==1.2.17
 torch==2.3.0
-vllm==0.4.2
+vllm==0.4.3
 numpy==1.26.0
 transformers==4.41.0
+pyyaml
@@ -1 +1 @@
-3.11.9
+3.11.7
41 changes: 28 additions & 13 deletions bentoml/bentos/gemma/2b-instruct-fp16/src/service.py
@@ -13,6 +13,7 @@
 import yaml
 from annotated_types import Ge, Le
 from bento_constants import CONSTANT_YAML
+from fastapi.responses import FileResponse
 from typing_extensions import Annotated
 
 CONSTANTS = yaml.safe_load(CONSTANT_YAML)
@@ -26,12 +27,14 @@
 
 
 openai_api_app = fastapi.FastAPI()
+static_app = fastapi.FastAPI()
+ui_app = fastapi.FastAPI()
 
 
 OPENAI_ENDPOINTS = [
-    ["/v1/chat/completions", vllm_api_server.create_chat_completion, ["POST"]],
-    ["/v1/completions", vllm_api_server.create_completion, ["POST"]],
-    ["/v1/models", vllm_api_server.show_available_models, ["GET"]],
+    ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]],
+    ["/completions", vllm_api_server.create_completion, ["POST"]],
+    ["/models", vllm_api_server.show_available_models, ["GET"]],
 ]
 
 
@@ -45,31 +48,40 @@
 
 
 STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui")
-INDEX_HTML = os.path.join(os.path.dirname(__file__), "ui", "index.html")
 
-openai_api_app.mount("/_next", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR))
+ui_app.mount(
+    "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static"
+)
+
 
-@openai_api_app.get("/")
-async def index():
-    with open(INDEX_HTML) as f:
-        return fastapi.responses.HTMLResponse(content=f.read())
+@ui_app.get("/")
+async def serve_chat_html():
+    return FileResponse(os.path.join(STATIC_DIR, "chat.html"))
 
 
+@ui_app.get("/{full_path:path}")
+async def catch_all(full_path: str):
+    file_path = os.path.join(STATIC_DIR, full_path)
+    if os.path.exists(file_path):
+        return FileResponse(file_path)
+    return FileResponse(os.path.join(STATIC_DIR, "chat.html"))
 
 
 # special handling for prometheus_client of bentoml
 if "prometheus_client" in sys.modules:
     sys.modules.pop("prometheus_client")
 
 
-@bentoml.mount_asgi_app(openai_api_app)
+@bentoml.mount_asgi_app(openai_api_app, path="/v1")
+@bentoml.mount_asgi_app(ui_app, path="/chat")
 @bentoml.service(**SERVICE_CONFIG)
 class VLLM:
     def __init__(self) -> None:
         from transformers import AutoTokenizer
         from vllm import AsyncEngineArgs, AsyncLLMEngine
         from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-        from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
+        from vllm.entrypoints.openai.serving_completion import \
+            OpenAIServingCompletion
 
         ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG)
         self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS)
@@ -82,18 +94,21 @@ def __init__(self) -> None:
         else:
             chat_template = None
 
+        model_config = self.engine.engine.get_model_config()
+
         # inject the engine into the openai serving chat and completion
         vllm_api_server.openai_serving_chat = OpenAIServingChat(
             engine=self.engine,
             served_model_names=[ENGINE_CONFIG["model"]],
             response_role="assistant",
             chat_template=chat_template,
-            # args.lora_modules,
+            model_config=model_config,
         )
         vllm_api_server.openai_serving_completion = OpenAIServingCompletion(
             engine=self.engine,
             served_model_names=[ENGINE_CONFIG["model"]],
-            # args.lora_modules,
+            model_config=model_config,
+            lora_modules=None,
        )
 
     @bentoml.api(route="/api/generate")
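Taken together, the service.py changes register the OpenAI routes without their /v1 prefix and mount the sub-app at path="/v1", so the public URLs are unchanged (which is why only the operationIds moved in openapi.yaml), while a second FastAPI app serves the chat UI under /chat with a single-page-app fallback. The new model_config arguments appear to track the updated OpenAIServingChat/OpenAIServingCompletion constructors in vllm 0.4.3. A quick routing check, assuming a local `bentoml serve` on the default port:

    import requests

    base = "http://localhost:3000"  # assumed local serve address

    # OpenAI-compatible API: registered internally as /models,
    # publicly reachable as /v1/models after the remount.
    print(requests.get(f"{base}/v1/models", timeout=10).json())

    # Chat UI: /chat/ returns chat.html, and an unknown path under /chat/
    # falls back to chat.html via the catch_all route.
    print(requests.get(f"{base}/chat/no-such-page", timeout=10).status_code)  # expect 200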