Commit

update

bojiang committed Jul 3, 2024
1 parent 2514be6 commit c8bc7c3
Showing 1,996 changed files with 14,392 additions and 8,217 deletions.

@@ -1,4 +1,4 @@
# llama2:7b-chat-fp16-50fcbe9
# gemma:2b-instruct-fp16-ad2d

[![pypi_status](https://img.shields.io/badge/BentoML-1.2.19-informational)](https://pypi.org/project/BentoML)
[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)
@@ -230,6 +230,7 @@ components:
- $ref: '#/components/schemas/ChatCompletionAssistantMessageParam'
- $ref: '#/components/schemas/ChatCompletionToolMessageParam'
- $ref: '#/components/schemas/ChatCompletionFunctionMessageParam'
- $ref: '#/components/schemas/CustomChatCompletionMessageParam'
title: Messages
type: array
min_p:
@@ -325,6 +326,7 @@ components:
anyOf:
- type: integer
- type: 'null'
default: 0
title: Top Logprobs
top_p:
anyOf:
@@ -396,6 +398,7 @@ components:
anyOf:
- $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
- $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
- $ref: '#/components/schemas/CustomChatCompletionContentPartParam'
type: array
title: Content
name:
@@ -657,6 +660,39 @@ components:
- prompt
title: CompletionRequest
type: object
CustomChatCompletionContentPartParam:
additionalProperties: true
properties:
type:
title: Type
type: string
required:
- type
title: CustomChatCompletionContentPartParam
type: object
CustomChatCompletionMessageParam:
description: Enables custom roles in the Chat Completion API.
properties:
content:
anyOf:
- type: string
- items:
anyOf:
- $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
- $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
- $ref: '#/components/schemas/CustomChatCompletionContentPartParam'
type: array
title: Content
name:
title: Name
type: string
role:
title: Role
type: string
required:
- role
title: CustomChatCompletionMessageParam
type: object
Function:
properties:
arguments:
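
The CustomChatCompletionMessageParam schema added above relaxes the fixed set of OpenAI chat roles, so a message may carry an arbitrary role string plus an optional name and content parts. A minimal request sketch that exercises it follows; the localhost:3000 host and the OpenAI-compatible /v1/chat/completions route are assumptions for illustration, not something this diff shows.

```python
import requests

# Hypothetical local endpoint: host, port, and route are assumptions,
# not taken from this commit.
URL = "http://localhost:3000/v1/chat/completions"

payload = {
    "model": "google/gemma-2b-it",
    "messages": [
        # A non-standard role, allowed by CustomChatCompletionMessageParam.
        {"role": "narrator", "content": "You narrate a short story in one paragraph."},
        {"role": "user", "content": "Begin the story."},
    ],
    "max_tokens": 128,
}

resp = requests.post(URL, json=payload, timeout=60)
resp.raise_for_status()
# Assumes the usual OpenAI-style response shape.
print(resp.json()["choices"][0]["message"]["content"])
```
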
@@ -1,16 +1,14 @@
service: service:VLLM
name: gemma
version: 2b-instruct-fp16-0f34ff9
version: 2b-instruct-fp16-ad2d
bentoml_version: 1.2.19
creation_time: '2024-07-02T14:28:31.122616+00:00'
creation_time: '2024-07-03T09:42:28.113622+00:00'
labels:
openllm_alias: 2b,2b-instruct
openllm_hf_model_id: google/gemma-2b-it
owner: bentoml-team
platforms: linux
service_home: /chat
source_directory: vllm-chat
source_repo: https://github.com/bentoml/openllm-repo-recipe.git
source: https://github.com/bentoml/openllm-repo-recipe/tree/main/vllm-chat
models: []
runners: []
entry_service: gemma
@@ -118,7 +116,7 @@ schema:
apis: []
docker:
distro: debian
python_version: '3.11'
python_version: '3.9'
cuda_version: null
env:
HF_TOKEN: ''
@@ -5,7 +5,7 @@
# ===========================================

# Block SETUP_BENTO_BASE_IMAGE
FROM python:3.11-slim as base-container
FROM python:3.9-slim as base-container

ENV LANG=C.UTF-8

@@ -4,6 +4,7 @@ annotated-types==0.7.0
anyio==4.4.0
appdirs==1.4.4
asgiref==3.8.1
async-timeout==4.0.3
attrs==23.2.0
bentoml==1.2.19
build==1.2.1
@@ -21,6 +22,7 @@ diskcache==5.6.3
distro==1.9.0
dnspython==2.6.1
email-validator==2.2.0
exceptiongroup==1.2.1
fastapi==0.111.0
fastapi-cli==0.0.4
filelock==3.15.4
@@ -51,7 +53,7 @@ mpmath==1.3.0
msgpack==1.0.8
multidict==6.0.5
nest-asyncio==1.6.0
networkx==3.3
networkx==3.2.1
ninja==1.11.1.1
numba==0.60.0
numpy==1.26.0
@@ -68,15 +70,15 @@ nvidia-ml-py==11.525.150
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.5.82
nvidia-nvtx-cu12==12.1.105
openai==1.35.7
openai==1.35.9
opentelemetry-api==1.20.0
opentelemetry-instrumentation==0.41b0
opentelemetry-instrumentation-aiohttp-client==0.41b0
opentelemetry-instrumentation-asgi==0.41b0
opentelemetry-sdk==1.20.0
opentelemetry-semantic-conventions==0.41b0
opentelemetry-util-http==0.41b0
orjson==3.10.5
orjson==3.10.6
outlines==0.0.34
packaging==24.1
pathspec==0.12.1
@@ -106,7 +108,7 @@ rich==13.7.1
rpds-py==0.18.1
safetensors==0.4.3
schema==0.7.7
scipy==1.14.0
scipy==1.13.1
sentencepiece==0.2.0
shellingham==1.5.4
simple-di==0.1.5
@@ -116,6 +118,7 @@ starlette==0.37.2
sympy==1.12.1
tiktoken==0.7.0
tokenizers==0.19.1
tomli==2.0.1
tomli-w==1.0.0
torch==2.3.0
tornado==6.4.1
@@ -0,0 +1 @@
3.9.19
@@ -11,7 +11,7 @@ docker:
dockerfile_template: null
env:
HF_TOKEN: ''
python_version: '3.11'
python_version: '3.9'
setup_script: null
system_packages: null
envs:
@@ -32,9 +32,7 @@ labels:
openllm_hf_model_id: google/gemma-2b-it
owner: bentoml-team
platforms: linux
service_home: /chat
source_directory: vllm-chat
source_repo: https://github.com/bentoml/openllm-repo-recipe.git
source: https://github.com/bentoml/openllm-repo-recipe/tree/main/vllm-chat
models: []
name: null
python:
@@ -170,49 +170,55 @@ async def chat(
"""
from vllm import SamplingParams

if OVERRIDE_CHAT_TEMPLATE: # community chat template
gen_config = _get_gen_config(CONSTANTS["chat_template"])
if not stop:
if gen_config["stop_str"]:
stop = [gen_config["stop_str"]]
else:
stop = []
system_prompt = gen_config["system_prompt"]
self.tokenizer.chat_template = gen_config["template"]
else:
if not stop:
if self.tokenizer.eos_token is not None:
stop = [self.tokenizer.eos_token]
else:
stop = []
system_prompt = None

SAMPLING_PARAM = SamplingParams(
max_tokens=max_tokens,
stop=stop,
)
if system_prompt and messages[0].get("role") != "system":
messages = [dict(role="system", content=system_prompt)] + messages

prompt = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)

stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM)

cursor = 0
strip_flag = True
async for request_output in stream:
text = request_output.outputs[0].text
assistant_message = text[cursor:]
if not strip_flag: # strip the leading whitespace
yield assistant_message
elif assistant_message.strip():
strip_flag = False
yield assistant_message.lstrip()
cursor = len(text)
try:
if OVERRIDE_CHAT_TEMPLATE: # community chat template
gen_config = _get_gen_config(CONSTANTS["chat_template"])
if not stop:
if gen_config["stop_str"]:
stop = [gen_config["stop_str"]]
else:
stop = []
system_prompt = gen_config["system_prompt"]
self.tokenizer.chat_template = gen_config["template"]
else:
if not stop:
if self.tokenizer.eos_token is not None:
stop = [self.tokenizer.eos_token]
else:
stop = []
system_prompt = None

SAMPLING_PARAM = SamplingParams(
max_tokens=max_tokens,
stop=stop,
)
if system_prompt and messages[0].role != "system":
messages = [dict(role="system", content=system_prompt)] + messages

prompt = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)

stream = await self.engine.add_request(
uuid.uuid4().hex, prompt, SAMPLING_PARAM
)

cursor = 0
strip_flag = True
async for request_output in stream:
text = request_output.outputs[0].text
assistant_message = text[cursor:]
if not strip_flag: # strip the leading whitespace
yield assistant_message
elif assistant_message.strip():
strip_flag = False
yield assistant_message.lstrip()
cursor = len(text)
except Exception as e:
logger.error(f"Error in chat API: {e}")
yield f"Error in chat API: {e}"


@functools.lru_cache(maxsize=1)
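
The rewritten chat() above consumes vLLM's cumulative output stream and re-emits only the new suffix on each iteration, stripping leading whitespace from the first non-empty piece so the reply does not start with a stray space. Below is a standalone sketch of that cursor/strip_flag pattern, with an in-memory fake stream standing in for the engine; all names here are illustrative, not from the source.

```python
import asyncio
from typing import AsyncIterator, List


async def fake_engine_stream(chunks: List[str]) -> AsyncIterator[str]:
    # Stand-in for request_output.outputs[0].text: each yielded item is the
    # *cumulative* generated text so far, not just the newest tokens.
    text = ""
    for chunk in chunks:
        text += chunk
        await asyncio.sleep(0)  # give control back, as a real engine would
        yield text


async def stream_deltas(cumulative: AsyncIterator[str]) -> AsyncIterator[str]:
    # Mirrors the cursor/strip_flag logic in chat(): emit only the new suffix
    # of the cumulative text, and lstrip the first non-empty piece.
    cursor = 0
    strip_flag = True
    async for text in cumulative:
        delta = text[cursor:]
        if not strip_flag:
            yield delta
        elif delta.strip():
            strip_flag = False
            yield delta.lstrip()
        cursor = len(text)


async def main() -> None:
    pieces = fake_engine_stream(["  ", "Hello", ",", " world", "!"])
    async for piece in stream_deltas(pieces):
        print(repr(piece))


if __name__ == "__main__":
    asyncio.run(main())
```

Running it prints 'Hello', ',', ' world' and '!' in order: the leading whitespace is dropped once, and every later delta passes through unchanged, which is the delta stream a client would concatenate.
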
@@ -1,4 +1,4 @@
# gemma:2b-instruct-fp16-0f34ff9
# gemma:7b-instruct-awq-4bit-5b23

[![pypi_status](https://img.shields.io/badge/BentoML-1.2.19-informational)](https://pypi.org/project/BentoML)
[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)
@@ -230,6 +230,7 @@ components:
- $ref: '#/components/schemas/ChatCompletionAssistantMessageParam'
- $ref: '#/components/schemas/ChatCompletionToolMessageParam'
- $ref: '#/components/schemas/ChatCompletionFunctionMessageParam'
- $ref: '#/components/schemas/CustomChatCompletionMessageParam'
title: Messages
type: array
min_p:
@@ -325,6 +326,7 @@ components:
anyOf:
- type: integer
- type: 'null'
default: 0
title: Top Logprobs
top_p:
anyOf:
@@ -396,6 +398,7 @@ components:
anyOf:
- $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
- $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
- $ref: '#/components/schemas/CustomChatCompletionContentPartParam'
type: array
title: Content
name:
@@ -657,6 +660,39 @@ components:
- prompt
title: CompletionRequest
type: object
CustomChatCompletionContentPartParam:
additionalProperties: true
properties:
type:
title: Type
type: string
required:
- type
title: CustomChatCompletionContentPartParam
type: object
CustomChatCompletionMessageParam:
description: Enables custom roles in the Chat Completion API.
properties:
content:
anyOf:
- type: string
- items:
anyOf:
- $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
- $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
- $ref: '#/components/schemas/CustomChatCompletionContentPartParam'
type: array
title: Content
name:
title: Name
type: string
role:
title: Role
type: string
required:
- role
title: CustomChatCompletionMessageParam
type: object
Function:
properties:
arguments:
@@ -1,16 +1,14 @@
service: service:VLLM
name: gemma
version: 7b-instruct-awq-4bit-5dd6145
version: 7b-instruct-awq-4bit-5b23
bentoml_version: 1.2.19
creation_time: '2024-07-02T14:30:04.464984+00:00'
creation_time: '2024-07-03T09:43:50.470130+00:00'
labels:
openllm_alias: 7b-4bit,7b-instruct-4bit
openllm_hf_model_id: casperhansen/gemma-7b-it-awq
owner: bentoml-team
platforms: linux
service_home: /chat
source_directory: vllm-chat
source_repo: https://github.com/bentoml/openllm-repo-recipe.git
source: https://github.com/bentoml/openllm-repo-recipe/tree/main/vllm-chat
models: []
runners: []
entry_service: gemma
@@ -118,7 +116,7 @@ schema:
apis: []
docker:
distro: debian
python_version: '3.11'
python_version: '3.9'
cuda_version: null
env:
HF_TOKEN: ''