diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/requirements.lock.txt b/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/bentofile.yaml b/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/bentofile.yaml deleted file mode 100644 index ffc70001..00000000 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: google/gemma-2b-it - openllm_alias: 2b,2b-instruct - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/README.md b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/README.md similarity index 91% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/README.md rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/README.md index b3e79471..9eef83dc 100644 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/README.md +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/README.md @@ -1,6 +1,6 @@ -# gemma:2b-instruct-fp16-f020 +# gemma:2b-instruct-fp16-f6ee -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/apis/openapi.yaml b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/apis/openapi.yaml rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/apis/openapi.yaml index 5c138cff..bff9e429 100644 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/apis/openapi.yaml +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# gemma:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# gemma:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/apis/schema.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/apis/schema.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/apis/schema.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/apis/schema.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/bento.yaml b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/bento.yaml similarity index 96% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/bento.yaml rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/bento.yaml index b84da906..105c4632 100644 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/bento.yaml +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: gemma -version: 2b-instruct-fp16-f020 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:19:28.675802+00:00' +version: 2b-instruct-fp16-f6ee +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:43:00.894615+00:00' labels: model_name: google/gemma-2b-it openllm_alias: 2b,2b-instruct diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/docker/Dockerfile b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/docker/Dockerfile similarity index 99% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/docker/Dockerfile rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/docker/Dockerfile index 6e67ca1c..9a323f70 100644 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/docker/Dockerfile +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/docker/Dockerfile @@ -33,7 +33,6 @@ RUN curl -LO https://astral.sh/uv/install.sh && \ ARG BENTO_USER=bentoml ARG BENTO_USER_UID=1034 ARG BENTO_USER_GID=1034 - RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER ARG HF_TOKEN= ENV HF_TOKEN=$HF_TOKEN diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/docker/entrypoint.sh b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/env/docker/entrypoint.sh rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/docker/entrypoint.sh diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/install.sh b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/install.sh similarity index 97% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/install.sh rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/install.sh index 607ee052..53ba63cd 100644 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/install.sh +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/install.sh @@ -20,7 +20,7 @@ PIP_ARGS=() REQUIREMENTS_TXT="$BASEDIR/requirements.txt" REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} # Install python packages, prefer installing the requirements.lock.txt file if it exist pushd "$BASEDIR" &>/dev/null if [ -f "$REQUIREMENTS_LOCK" ]; then diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/requirements.lock.txt b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/requirements.lock.txt similarity index 90% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/requirements.lock.txt rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/requirements.lock.txt index cbdefc41..92396f10 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/requirements.lock.txt +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/requirements.lock.txt @@ -1,5 +1,5 @@ -aiohappyeyeballs==2.3.4 -aiohttp==3.10.0 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 aiosqlite==0.20.0 annotated-types==0.7.0 @@ -7,8 +7,8 @@ anyio==4.4.0 appdirs==1.4.4 asgiref==3.8.1 async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 +attrs==24.2.0 +bentoml==1.3.1 cattrs==23.1.2 certifi==2024.7.4 charset-normalizer==3.3.2 @@ -16,7 +16,7 @@ circus==0.18.0 click==8.1.7 click-option-group==0.5.6 cloudpickle==3.0.0 -cmake==3.30.1 +cmake==3.30.2 datasets==2.14.4 deepmerge==1.1.1 deprecated==1.2.14 @@ -27,7 +27,7 @@ dnspython==2.6.1 email-validator==2.2.0 exceptiongroup==1.2.2 fastapi==0.111.0 -fastapi-cli==0.0.4 +fastapi-cli==0.0.5 filelock==3.15.4 frozenlist==1.4.1 fs==2.4.16 @@ -44,6 +44,7 @@ inflection==0.5.1 inquirerpy==0.3.4 interegular==0.3.3 jinja2==3.1.4 +jiter==0.5.0 jsonschema==4.23.0 jsonschema-specifications==2023.12.1 lark==1.1.9 @@ -74,7 +75,7 @@ nvidia-ml-py==11.525.150 nvidia-nccl-cu12==2.20.5 nvidia-nvjitlink-cu12==12.6.20 nvidia-nvtx-cu12==12.1.105 -openai==1.37.1 +openai==1.40.3 opentelemetry-api==1.20.0 opentelemetry-instrumentation==0.41b0 opentelemetry-instrumentation-aiohttp-client==0.41b0 @@ -82,7 +83,7 @@ opentelemetry-instrumentation-asgi==0.41b0 opentelemetry-sdk==1.20.0 opentelemetry-semantic-conventions==0.41b0 opentelemetry-util-http==0.41b0 -orjson==3.10.6 +orjson==3.10.7 outlines==0.0.46 packaging==24.1 pandas==2.2.2 @@ -108,15 +109,15 @@ python-dotenv==1.0.1 python-json-logger==2.0.7 python-multipart==0.0.9 pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 +pyyaml==6.0.2 +pyzmq==26.1.0 ray==2.34.0 referencing==0.35.1 regex==2024.7.24 requests==2.32.3 rich==13.7.1 -rpds-py==0.19.1 -safetensors==0.4.3 +rpds-py==0.20.0 +safetensors==0.4.4 schema==0.7.7 sentencepiece==0.2.0 setuptools==72.1.0 @@ -125,7 +126,7 @@ simple-di==0.1.5 six==1.16.0 sniffio==1.3.1 starlette==0.37.2 -sympy==1.13.1 +sympy==1.13.2 tiktoken==0.7.0 tokenizers==0.19.1 tomli==2.0.1 @@ -133,7 +134,7 @@ tomli-w==1.0.0 torch==2.3.1 torchvision==0.18.1 tornado==6.4.1 -tqdm==4.66.4 +tqdm==4.66.5 transformers==4.43.1 triton==2.3.1 typer==0.12.3 @@ -141,12 +142,12 @@ typing-extensions==4.12.2 tzdata==2024.1 ujson==5.10.0 urllib3==2.2.2 -uv==0.2.32 -uvicorn==0.30.4 +uv==0.2.35 +uvicorn==0.30.5 uvloop==0.19.0 vllm==0.5.3.post1 vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 +watchfiles==0.23.0 wcwidth==0.2.13 websockets==12.0 wrapt==1.16.0 @@ -154,4 +155,4 @@ wsproto==1.2.0 xformers==0.0.27 xxhash==3.4.1 yarl==1.9.4 -zipp==3.19.2 +zipp==3.20.0 diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/requirements.txt b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/requirements.txt similarity index 85% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/requirements.txt rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/requirements.txt index dc141d0d..84ab2103 100644 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/requirements.txt +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/requirements.txt @@ -1,4 +1,4 @@ -bentoml==1.3.0 +bentoml==1.3.1 torch==2.3.1 vllm==0.5.3.post1 numpy==1.26.0 diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/version.txt b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/version.txt similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/version.txt rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/env/python/version.txt diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/bentofile.yaml b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/bentofile.yaml new file mode 100644 index 00000000..8b7808ab --- /dev/null +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: google/gemma-2b-it + openllm_alias: 2b,2b-instruct + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/bento_constants.py b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/openllm_config.yaml similarity index 91% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/bento_constants.py rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/openllm_config.yaml index 0414c91d..8016c3c2 100644 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/bento_constants.py +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -15,5 +13,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/service.py b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/service.py similarity index 97% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/service.py rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/service.py index a6f4b728..8fd0ca16 100644 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/service.py +++ b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/service.py @@ -13,7 +13,6 @@ import vllm.entrypoints.openai.api_server as vllm_api_server import yaml from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML from fastapi.responses import FileResponse from typing_extensions import Annotated, Literal @@ -22,8 +21,10 @@ class Message(pydantic.BaseModel): content: str role: Literal["system", "user", "assistant"] - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) ENGINE_CONFIG = CONSTANTS["engine_config"] SERVICE_CONFIG = CONSTANTS["service_config"] diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/404.html b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/404.html similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/404.html rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/404.html diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/apple-touch-icon.png b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/apple-touch-icon.png rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/chat.html b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/chat.html similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/chat.html rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/chat.html diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/chat.txt b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/chat.txt rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/chat.txt diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/favicon-16x16.png b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/favicon-16x16.png rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/favicon.ico b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/favicon.ico rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/favicon.ico diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/index.html b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/index.html similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/index.html rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/index.html diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/index.txt b/bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/index.txt similarity index 100% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/src/ui/index.txt rename to bentoml/bentos/gemma/2b-instruct-fp16-f6ee/src/ui/index.txt diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/requirements.lock.txt b/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/README.md b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/README.md similarity index 91% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/README.md rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/README.md index eebb3fe9..b422f92b 100644 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/README.md +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/README.md @@ -1,6 +1,6 @@ -# gemma:7b-instruct-awq-4bit-2eed +# gemma:7b-instruct-awq-4bit-bdb5 -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/apis/openapi.yaml b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/apis/openapi.yaml rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/apis/openapi.yaml index 7db54265..61a1f796 100644 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/apis/openapi.yaml +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# gemma:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# gemma:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/apis/schema.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/apis/schema.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/apis/schema.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/apis/schema.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/bento.yaml b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/bento.yaml similarity index 96% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/bento.yaml rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/bento.yaml index 3aa596a2..7115fa56 100644 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/bento.yaml +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: gemma -version: 7b-instruct-awq-4bit-2eed -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:19:43.100369+00:00' +version: 7b-instruct-awq-4bit-bdb5 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:43:15.804156+00:00' labels: model_name: casperhansen/gemma-7b-it-awq openllm_alias: 7b-4bit,7b-instruct-4bit diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/docker/Dockerfile b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/docker/Dockerfile similarity index 99% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/env/docker/Dockerfile rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/docker/Dockerfile index 6e67ca1c..9a323f70 100644 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/docker/Dockerfile +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/docker/Dockerfile @@ -33,7 +33,6 @@ RUN curl -LO https://astral.sh/uv/install.sh && \ ARG BENTO_USER=bentoml ARG BENTO_USER_UID=1034 ARG BENTO_USER_GID=1034 - RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER ARG HF_TOKEN= ENV HF_TOKEN=$HF_TOKEN diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/docker/entrypoint.sh b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/docker/entrypoint.sh rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/docker/entrypoint.sh diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/install.sh b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/install.sh similarity index 97% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/install.sh rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/install.sh index 607ee052..53ba63cd 100644 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/python/install.sh +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/install.sh @@ -20,7 +20,7 @@ PIP_ARGS=() REQUIREMENTS_TXT="$BASEDIR/requirements.txt" REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} # Install python packages, prefer installing the requirements.lock.txt file if it exist pushd "$BASEDIR" &>/dev/null if [ -f "$REQUIREMENTS_LOCK" ]; then diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/requirements.lock.txt b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/requirements.lock.txt similarity index 90% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/requirements.lock.txt rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/requirements.lock.txt index cbdefc41..92396f10 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/requirements.lock.txt +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/requirements.lock.txt @@ -1,5 +1,5 @@ -aiohappyeyeballs==2.3.4 -aiohttp==3.10.0 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 aiosqlite==0.20.0 annotated-types==0.7.0 @@ -7,8 +7,8 @@ anyio==4.4.0 appdirs==1.4.4 asgiref==3.8.1 async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 +attrs==24.2.0 +bentoml==1.3.1 cattrs==23.1.2 certifi==2024.7.4 charset-normalizer==3.3.2 @@ -16,7 +16,7 @@ circus==0.18.0 click==8.1.7 click-option-group==0.5.6 cloudpickle==3.0.0 -cmake==3.30.1 +cmake==3.30.2 datasets==2.14.4 deepmerge==1.1.1 deprecated==1.2.14 @@ -27,7 +27,7 @@ dnspython==2.6.1 email-validator==2.2.0 exceptiongroup==1.2.2 fastapi==0.111.0 -fastapi-cli==0.0.4 +fastapi-cli==0.0.5 filelock==3.15.4 frozenlist==1.4.1 fs==2.4.16 @@ -44,6 +44,7 @@ inflection==0.5.1 inquirerpy==0.3.4 interegular==0.3.3 jinja2==3.1.4 +jiter==0.5.0 jsonschema==4.23.0 jsonschema-specifications==2023.12.1 lark==1.1.9 @@ -74,7 +75,7 @@ nvidia-ml-py==11.525.150 nvidia-nccl-cu12==2.20.5 nvidia-nvjitlink-cu12==12.6.20 nvidia-nvtx-cu12==12.1.105 -openai==1.37.1 +openai==1.40.3 opentelemetry-api==1.20.0 opentelemetry-instrumentation==0.41b0 opentelemetry-instrumentation-aiohttp-client==0.41b0 @@ -82,7 +83,7 @@ opentelemetry-instrumentation-asgi==0.41b0 opentelemetry-sdk==1.20.0 opentelemetry-semantic-conventions==0.41b0 opentelemetry-util-http==0.41b0 -orjson==3.10.6 +orjson==3.10.7 outlines==0.0.46 packaging==24.1 pandas==2.2.2 @@ -108,15 +109,15 @@ python-dotenv==1.0.1 python-json-logger==2.0.7 python-multipart==0.0.9 pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 +pyyaml==6.0.2 +pyzmq==26.1.0 ray==2.34.0 referencing==0.35.1 regex==2024.7.24 requests==2.32.3 rich==13.7.1 -rpds-py==0.19.1 -safetensors==0.4.3 +rpds-py==0.20.0 +safetensors==0.4.4 schema==0.7.7 sentencepiece==0.2.0 setuptools==72.1.0 @@ -125,7 +126,7 @@ simple-di==0.1.5 six==1.16.0 sniffio==1.3.1 starlette==0.37.2 -sympy==1.13.1 +sympy==1.13.2 tiktoken==0.7.0 tokenizers==0.19.1 tomli==2.0.1 @@ -133,7 +134,7 @@ tomli-w==1.0.0 torch==2.3.1 torchvision==0.18.1 tornado==6.4.1 -tqdm==4.66.4 +tqdm==4.66.5 transformers==4.43.1 triton==2.3.1 typer==0.12.3 @@ -141,12 +142,12 @@ typing-extensions==4.12.2 tzdata==2024.1 ujson==5.10.0 urllib3==2.2.2 -uv==0.2.32 -uvicorn==0.30.4 +uv==0.2.35 +uvicorn==0.30.5 uvloop==0.19.0 vllm==0.5.3.post1 vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 +watchfiles==0.23.0 wcwidth==0.2.13 websockets==12.0 wrapt==1.16.0 @@ -154,4 +155,4 @@ wsproto==1.2.0 xformers==0.0.27 xxhash==3.4.1 yarl==1.9.4 -zipp==3.19.2 +zipp==3.20.0 diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/requirements.txt b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/requirements.txt similarity index 85% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/requirements.txt rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/requirements.txt index dc141d0d..84ab2103 100644 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/requirements.txt +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/requirements.txt @@ -1,4 +1,4 @@ -bentoml==1.3.0 +bentoml==1.3.1 torch==2.3.1 vllm==0.5.3.post1 numpy==1.26.0 diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/version.txt b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/version.txt similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/version.txt rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/env/python/version.txt diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/bentofile.yaml b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/bentofile.yaml rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/bentofile.yaml index b5c19050..25f8d1fa 100644 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/bentofile.yaml +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 7b-4bit,7b-instruct-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/bento_constants.py b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/bento_constants.py rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/openllm_config.yaml index c379bb9c..43e76143 100644 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/bento_constants.py +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: gemma-it engine_config: max_model_len: 2048 @@ -16,5 +14,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/service.py b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/service.py similarity index 97% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/service.py rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/service.py index a6f4b728..8fd0ca16 100644 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/service.py +++ b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/service.py @@ -13,7 +13,6 @@ import vllm.entrypoints.openai.api_server as vllm_api_server import yaml from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML from fastapi.responses import FileResponse from typing_extensions import Annotated, Literal @@ -22,8 +21,10 @@ class Message(pydantic.BaseModel): content: str role: Literal["system", "user", "assistant"] - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) ENGINE_CONFIG = CONSTANTS["engine_config"] SERVICE_CONFIG = CONSTANTS["service_config"] diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/404.html b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/404.html similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/404.html rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/404.html diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/apple-touch-icon.png b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/apple-touch-icon.png rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/chat.html b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/chat.html similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/chat.html rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/chat.html diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/chat.txt b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/chat.txt rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/chat.txt diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/favicon-16x16.png b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/favicon-16x16.png rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/favicon.ico b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/favicon.ico rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/favicon.ico diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/index.html b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/index.html similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/index.html rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/index.html diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/index.txt b/bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/index.txt similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/src/ui/index.txt rename to bentoml/bentos/gemma/7b-instruct-awq-4bit-bdb5/src/ui/index.txt diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/bentofile.yaml b/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/bentofile.yaml deleted file mode 100644 index 188e7a36..00000000 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: google/gemma-7b-it - openllm_alias: 7b,7b-instruct - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/README.md b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/README.md similarity index 91% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/README.md rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/README.md index 8b3798de..72965430 100644 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/README.md +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/README.md @@ -1,6 +1,6 @@ -# gemma:7b-instruct-fp16-1e96 +# gemma:7b-instruct-fp16-35e0 -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/apis/openapi.yaml b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/apis/openapi.yaml rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/apis/openapi.yaml index 3153a312..7f2a285b 100644 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/apis/openapi.yaml +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# gemma:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# gemma:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/apis/schema.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/apis/schema.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/apis/schema.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/apis/schema.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/bento.yaml b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/bento.yaml similarity index 96% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/bento.yaml rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/bento.yaml index 7bdcfdc1..c2cce8db 100644 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/bento.yaml +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: gemma -version: 7b-instruct-fp16-1e96 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:19:35.937782+00:00' +version: 7b-instruct-fp16-35e0 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:43:08.363056+00:00' labels: model_name: google/gemma-7b-it openllm_alias: 7b,7b-instruct diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/docker/Dockerfile b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/docker/Dockerfile similarity index 99% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/docker/Dockerfile rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/docker/Dockerfile index 6e67ca1c..9a323f70 100644 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/docker/Dockerfile +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/docker/Dockerfile @@ -33,7 +33,6 @@ RUN curl -LO https://astral.sh/uv/install.sh && \ ARG BENTO_USER=bentoml ARG BENTO_USER_UID=1034 ARG BENTO_USER_GID=1034 - RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER ARG HF_TOKEN= ENV HF_TOKEN=$HF_TOKEN diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/docker/entrypoint.sh b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/docker/entrypoint.sh rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/docker/entrypoint.sh diff --git a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/install.sh b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/install.sh similarity index 97% rename from bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/install.sh rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/install.sh index 607ee052..53ba63cd 100644 --- a/bentoml/bentos/gemma/7b-instruct-awq-4bit-2eed/env/python/install.sh +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/install.sh @@ -20,7 +20,7 @@ PIP_ARGS=() REQUIREMENTS_TXT="$BASEDIR/requirements.txt" REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} # Install python packages, prefer installing the requirements.lock.txt file if it exist pushd "$BASEDIR" &>/dev/null if [ -f "$REQUIREMENTS_LOCK" ]; then diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/requirements.lock.txt b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/requirements.lock.txt similarity index 86% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/requirements.lock.txt rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/requirements.lock.txt index 0684a350..92396f10 100644 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/requirements.lock.txt +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/requirements.lock.txt @@ -1,4 +1,5 @@ -aiohttp==3.9.5 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 aiosqlite==0.20.0 annotated-types==0.7.0 @@ -6,8 +7,8 @@ anyio==4.4.0 appdirs==1.4.4 asgiref==3.8.1 async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 +attrs==24.2.0 +bentoml==1.3.1 cattrs==23.1.2 certifi==2024.7.4 charset-normalizer==3.3.2 @@ -15,7 +16,7 @@ circus==0.18.0 click==8.1.7 click-option-group==0.5.6 cloudpickle==3.0.0 -cmake==3.30.1 +cmake==3.30.2 datasets==2.14.4 deepmerge==1.1.1 deprecated==1.2.14 @@ -26,7 +27,7 @@ dnspython==2.6.1 email-validator==2.2.0 exceptiongroup==1.2.2 fastapi==0.111.0 -fastapi-cli==0.0.4 +fastapi-cli==0.0.5 filelock==3.15.4 frozenlist==1.4.1 fs==2.4.16 @@ -36,13 +37,14 @@ httpcore==1.0.5 httptools==0.6.1 httpx==0.27.0 httpx-ws==0.6.0 -huggingface-hub==0.24.1 +huggingface-hub==0.24.5 idna==3.7 importlib-metadata==6.11.0 inflection==0.5.1 inquirerpy==0.3.4 interegular==0.3.3 jinja2==3.1.4 +jiter==0.5.0 jsonschema==4.23.0 jsonschema-specifications==2023.12.1 lark==1.1.9 @@ -71,9 +73,9 @@ nvidia-cusolver-cu12==11.4.5.107 nvidia-cusparse-cu12==12.1.0.106 nvidia-ml-py==11.525.150 nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.20 nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 +openai==1.40.3 opentelemetry-api==1.20.0 opentelemetry-instrumentation==0.41b0 opentelemetry-instrumentation-aiohttp-client==0.41b0 @@ -81,7 +83,7 @@ opentelemetry-instrumentation-asgi==0.41b0 opentelemetry-sdk==1.20.0 opentelemetry-semantic-conventions==0.41b0 opentelemetry-util-http==0.41b0 -orjson==3.10.6 +orjson==3.10.7 outlines==0.0.46 packaging==24.1 pandas==2.2.2 @@ -92,7 +94,7 @@ pip-requirements-parser==32.0.1 prometheus-client==0.20.0 prometheus-fastapi-instrumentator==7.0.0 prompt-toolkit==3.0.47 -protobuf==5.27.2 +protobuf==5.27.3 psutil==6.0.0 py-cpuinfo==9.0.0 pyairports==2.1.1 @@ -107,24 +109,24 @@ python-dotenv==1.0.1 python-json-logger==2.0.7 python-multipart==0.0.9 pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 referencing==0.35.1 -regex==2024.5.15 +regex==2024.7.24 requests==2.32.3 rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 +rpds-py==0.20.0 +safetensors==0.4.4 schema==0.7.7 sentencepiece==0.2.0 -setuptools==71.1.0 +setuptools==72.1.0 shellingham==1.5.4 simple-di==0.1.5 six==1.16.0 sniffio==1.3.1 starlette==0.37.2 -sympy==1.13.1 +sympy==1.13.2 tiktoken==0.7.0 tokenizers==0.19.1 tomli==2.0.1 @@ -132,7 +134,7 @@ tomli-w==1.0.0 torch==2.3.1 torchvision==0.18.1 tornado==6.4.1 -tqdm==4.66.4 +tqdm==4.66.5 transformers==4.43.1 triton==2.3.1 typer==0.12.3 @@ -140,12 +142,12 @@ typing-extensions==4.12.2 tzdata==2024.1 ujson==5.10.0 urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 +uv==0.2.35 +uvicorn==0.30.5 uvloop==0.19.0 vllm==0.5.3.post1 vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 +watchfiles==0.23.0 wcwidth==0.2.13 websockets==12.0 wrapt==1.16.0 @@ -153,4 +155,4 @@ wsproto==1.2.0 xformers==0.0.27 xxhash==3.4.1 yarl==1.9.4 -zipp==3.19.2 +zipp==3.20.0 diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/requirements.txt b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/requirements.txt similarity index 85% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/requirements.txt rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/requirements.txt index dc141d0d..84ab2103 100644 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/requirements.txt +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/requirements.txt @@ -1,4 +1,4 @@ -bentoml==1.3.0 +bentoml==1.3.1 torch==2.3.1 vllm==0.5.3.post1 numpy==1.26.0 diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/version.txt b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/version.txt similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/env/python/version.txt rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/env/python/version.txt diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/bentofile.yaml b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/bentofile.yaml new file mode 100644 index 00000000..c661db02 --- /dev/null +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: google/gemma-7b-it + openllm_alias: 7b,7b-instruct + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/bento_constants.py b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/openllm_config.yaml similarity index 91% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/bento_constants.py rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/openllm_config.yaml index 18062490..83ac4b15 100644 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/bento_constants.py +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -15,5 +13,3 @@ gpu_type: nvidia-tesla-l4 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/service.py b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/service.py similarity index 97% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/service.py rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/service.py index a6f4b728..8fd0ca16 100644 --- a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/service.py +++ b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/service.py @@ -13,7 +13,6 @@ import vllm.entrypoints.openai.api_server as vllm_api_server import yaml from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML from fastapi.responses import FileResponse from typing_extensions import Annotated, Literal @@ -22,8 +21,10 @@ class Message(pydantic.BaseModel): content: str role: Literal["system", "user", "assistant"] - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) ENGINE_CONFIG = CONSTANTS["engine_config"] SERVICE_CONFIG = CONSTANTS["service_config"] diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/404.html b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/404.html similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/404.html rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/404.html diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/apple-touch-icon.png b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/apple-touch-icon.png rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/chat.html b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/chat.html similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/chat.html rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/chat.html diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/chat.txt b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/chat.txt rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/chat.txt diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/favicon-16x16.png b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/favicon-16x16.png rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/favicon.ico b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/favicon.ico rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/favicon.ico diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/index.html b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/index.html similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/index.html rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/index.html diff --git a/bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/index.txt b/bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/index.txt similarity index 100% rename from bentoml/bentos/gemma/7b-instruct-fp16-1e96/src/ui/index.txt rename to bentoml/bentos/gemma/7b-instruct-fp16-35e0/src/ui/index.txt diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/README.md b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/README.md similarity index 91% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/README.md rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/README.md index 31efa37a..9b827a1f 100644 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/README.md +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/README.md @@ -1,6 +1,6 @@ -# gemma2:27b-instruct-fp16-9fff +# gemma2:27b-instruct-fp16-9799 -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/apis/openapi.yaml b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/apis/openapi.yaml rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/apis/openapi.yaml index 0925fbef..7b00ed6a 100644 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/apis/openapi.yaml +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# gemma2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# gemma2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/apis/schema.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/apis/schema.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/apis/schema.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/apis/schema.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/bento.yaml b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/bento.yaml similarity index 96% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/bento.yaml rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/bento.yaml index e4a40dcb..5a26ebe5 100644 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/bento.yaml +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: gemma2 -version: 27b-instruct-fp16-9fff -bentoml_version: 1.3.0 -creation_time: '2024-07-25T22:48:10.530027+00:00' +version: 27b-instruct-fp16-9799 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:42:53.423488+00:00' labels: model_name: google/gemma-2-27b-it openllm_alias: 27b,27b-instruct diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/docker/Dockerfile b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/docker/Dockerfile similarity index 99% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/docker/Dockerfile rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/docker/Dockerfile index 4f724698..b100b059 100644 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/docker/Dockerfile +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/docker/Dockerfile @@ -33,7 +33,6 @@ RUN curl -LO https://astral.sh/uv/install.sh && \ ARG BENTO_USER=bentoml ARG BENTO_USER_UID=1034 ARG BENTO_USER_GID=1034 - RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER ARG HF_TOKEN= ENV HF_TOKEN=$HF_TOKEN diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/docker/entrypoint.sh b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/docker/entrypoint.sh rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/docker/entrypoint.sh diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/install.sh b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/install.sh similarity index 97% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/install.sh rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/install.sh index 607ee052..53ba63cd 100644 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/install.sh +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/install.sh @@ -20,7 +20,7 @@ PIP_ARGS=() REQUIREMENTS_TXT="$BASEDIR/requirements.txt" REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} # Install python packages, prefer installing the requirements.lock.txt file if it exist pushd "$BASEDIR" &>/dev/null if [ -f "$REQUIREMENTS_LOCK" ]; then diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/requirements.lock.txt b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/requirements.lock.txt new file mode 100644 index 00000000..1c76af64 --- /dev/null +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/requirements.lock.txt @@ -0,0 +1,159 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +flashinfer==0.1.4+cu121torch2.3 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/requirements.txt b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/requirements.txt similarity index 91% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/requirements.txt rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/requirements.txt index cc1fa2a6..e097761e 100644 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/requirements.txt +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/requirements.txt @@ -1,4 +1,4 @@ -bentoml==1.3.0 +bentoml==1.3.1 torch==2.3.1 vllm==0.5.3.post1 numpy==1.26.0 diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/version.txt b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/version.txt similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/version.txt rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/env/python/version.txt diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/bentofile.yaml b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/bentofile.yaml rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/bentofile.yaml index ee9a5c09..529118f8 100644 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/bentofile.yaml +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/bentofile.yaml @@ -1,27 +1,10 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - VLLM_ATTENTION_BACKEND: FLASHINFER - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN - name: VLLM_ATTENTION_BACKEND value: FLASHINFER -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -35,18 +18,7 @@ labels: openllm_alias: 27b,27b-instruct platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/bento_constants.py b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/openllm_config.yaml similarity index 94% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/bento_constants.py rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/openllm_config.yaml index 0be189a7..dce0c454 100644 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/bento_constants.py +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -21,5 +19,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/service.py b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/service.py similarity index 97% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/service.py rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/service.py index a6f4b728..8fd0ca16 100644 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/service.py +++ b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/service.py @@ -13,7 +13,6 @@ import vllm.entrypoints.openai.api_server as vllm_api_server import yaml from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML from fastapi.responses import FileResponse from typing_extensions import Annotated, Literal @@ -22,8 +21,10 @@ class Message(pydantic.BaseModel): content: str role: Literal["system", "user", "assistant"] - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) ENGINE_CONFIG = CONSTANTS["engine_config"] SERVICE_CONFIG = CONSTANTS["service_config"] diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/404.html b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/404.html similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/404.html rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/404.html diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/apple-touch-icon.png b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/apple-touch-icon.png rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/chat.html b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/chat.html similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/chat.html rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/chat.html diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/chat.txt b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/chat.txt rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/chat.txt diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/favicon-16x16.png b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/favicon-16x16.png rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/favicon.ico b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/favicon.ico rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/favicon.ico diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/index.html b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/index.html similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/index.html rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/index.html diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/index.txt b/bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/index.txt similarity index 100% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/src/ui/index.txt rename to bentoml/bentos/gemma2/27b-instruct-fp16-9799/src/ui/index.txt diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/requirements.lock.txt b/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/requirements.lock.txt deleted file mode 100644 index 75d5fb4b..00000000 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/requirements.lock.txt +++ /dev/null @@ -1,158 +0,0 @@ ---extra-index-url https://flashinfer.ai/whl/cu121/torch2.3 -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -flashinfer==0.1.1+cu121torch2.3 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.2 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.33.0 -referencing==0.35.1 -regex==2024.7.24 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.1 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.29 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/README.md b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/README.md similarity index 91% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/README.md rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/README.md index 6d992703..d5f7666c 100644 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/README.md +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/README.md @@ -1,6 +1,6 @@ -# gemma2:9b-instruct-fp16-dce1 +# gemma2:9b-instruct-fp16-cb2b -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/apis/openapi.yaml b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/apis/openapi.yaml rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/apis/openapi.yaml index 8a3321e6..03bcb816 100644 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/apis/openapi.yaml +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# gemma2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# gemma2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/apis/schema.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/apis/schema.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/apis/schema.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/apis/schema.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/bento.yaml b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/bento.yaml similarity index 96% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/bento.yaml rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/bento.yaml index 5ae9efa3..36e7103a 100644 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/bento.yaml +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: gemma2 -version: 9b-instruct-fp16-dce1 -bentoml_version: 1.3.0 -creation_time: '2024-07-25T22:48:03.450728+00:00' +version: 9b-instruct-fp16-cb2b +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:42:45.571311+00:00' labels: model_name: google/gemma-2-9b-it openllm_alias: 9b,9b-instruct diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/docker/Dockerfile b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/docker/Dockerfile similarity index 99% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/docker/Dockerfile rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/docker/Dockerfile index 4f724698..b100b059 100644 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/docker/Dockerfile +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/docker/Dockerfile @@ -33,7 +33,6 @@ RUN curl -LO https://astral.sh/uv/install.sh && \ ARG BENTO_USER=bentoml ARG BENTO_USER_UID=1034 ARG BENTO_USER_GID=1034 - RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER ARG HF_TOKEN= ENV HF_TOKEN=$HF_TOKEN diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/docker/entrypoint.sh b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/docker/entrypoint.sh rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/docker/entrypoint.sh diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/install.sh b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/requirements.lock.txt b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/requirements.lock.txt new file mode 100644 index 00000000..1c76af64 --- /dev/null +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/requirements.lock.txt @@ -0,0 +1,159 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +flashinfer==0.1.4+cu121torch2.3 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/requirements.txt b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/requirements.txt similarity index 91% rename from bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/requirements.txt rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/requirements.txt index cc1fa2a6..e097761e 100644 --- a/bentoml/bentos/gemma2/27b-instruct-fp16-9fff/env/python/requirements.txt +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/requirements.txt @@ -1,4 +1,4 @@ -bentoml==1.3.0 +bentoml==1.3.1 torch==2.3.1 vllm==0.5.3.post1 numpy==1.26.0 diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/version.txt b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/version.txt similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/version.txt rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/env/python/version.txt diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/bentofile.yaml b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/bentofile.yaml new file mode 100644 index 00000000..2bf1bc6a --- /dev/null +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/bentofile.yaml @@ -0,0 +1,24 @@ +envs: +- name: HF_TOKEN +- name: VLLM_ATTENTION_BACKEND + value: FLASHINFER +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: google/gemma-2-9b-it + openllm_alias: 9b,9b-instruct + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/bento_constants.py b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/openllm_config.yaml similarity index 94% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/bento_constants.py rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/openllm_config.yaml index 4f9c67fb..1eb93549 100644 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/bento_constants.py +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -21,5 +19,3 @@ gpu_type: nvidia-tesla-l4 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/service.py b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/404.html b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/404.html similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/404.html rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/404.html diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/apple-touch-icon.png b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/apple-touch-icon.png rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/chat.html b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/chat.html similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/chat.html rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/chat.html diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/chat.txt b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/chat.txt rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/chat.txt diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/favicon-16x16.png b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/favicon-16x16.png rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/favicon.ico b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/favicon.ico rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/favicon.ico diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/index.html b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/index.html similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/index.html rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/index.html diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/index.txt b/bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/index.txt similarity index 100% rename from bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/ui/index.txt rename to bentoml/bentos/gemma2/9b-instruct-fp16-cb2b/src/ui/index.txt diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/install.sh b/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/requirements.lock.txt b/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/requirements.lock.txt deleted file mode 100644 index 75d5fb4b..00000000 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/env/python/requirements.lock.txt +++ /dev/null @@ -1,158 +0,0 @@ ---extra-index-url https://flashinfer.ai/whl/cu121/torch2.3 -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -flashinfer==0.1.1+cu121torch2.3 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.2 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.33.0 -referencing==0.35.1 -regex==2024.7.24 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.1 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.29 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/bentofile.yaml b/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/bentofile.yaml deleted file mode 100644 index b1351b0e..00000000 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/bentofile.yaml +++ /dev/null @@ -1,52 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - VLLM_ATTENTION_BACKEND: FLASHINFER - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -- name: VLLM_ATTENTION_BACKEND - value: FLASHINFER -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: google/gemma-2-9b-it - openllm_alias: 9b,9b-instruct - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/service.py b/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/gemma2/9b-instruct-fp16-dce1/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/install.sh b/bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/bentofile.yaml b/bentoml/bentos/llama2/13b-chat-fp16-603a/src/bentofile.yaml deleted file mode 100644 index 8068f0e4..00000000 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: meta-llama/Llama-2-13b-chat-hf - openllm_alias: 13b,13b-chat - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/service.py b/bentoml/bentos/llama2/13b-chat-fp16-603a/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/README.md b/bentoml/bentos/llama2/13b-chat-fp16-a846/README.md similarity index 91% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/README.md rename to bentoml/bentos/llama2/13b-chat-fp16-a846/README.md index bf9f719b..f6ea5b6f 100644 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/README.md +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/README.md @@ -1,6 +1,6 @@ -# llama2:70b-chat-fp16-11af +# llama2:13b-chat-fp16-a846 -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/apis/openapi.yaml b/bentoml/bentos/llama2/13b-chat-fp16-a846/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/apis/openapi.yaml rename to bentoml/bentos/llama2/13b-chat-fp16-a846/apis/openapi.yaml index 24d2612d..c43c9cad 100644 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/apis/openapi.yaml +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/apis/schema.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/apis/schema.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/apis/schema.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/apis/schema.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/bento.yaml b/bentoml/bentos/llama2/13b-chat-fp16-a846/bento.yaml similarity index 96% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/bento.yaml rename to bentoml/bentos/llama2/13b-chat-fp16-a846/bento.yaml index 438bd94e..4fed98ff 100644 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/bento.yaml +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama2 -version: 13b-chat-fp16-603a -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:17:48.252205+00:00' +version: 13b-chat-fp16-a846 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:40:21.800939+00:00' labels: model_name: meta-llama/Llama-2-13b-chat-hf openllm_alias: 13b,13b-chat diff --git a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/docker/Dockerfile b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/docker/Dockerfile similarity index 99% rename from bentoml/bentos/gemma/2b-instruct-fp16-f020/env/docker/Dockerfile rename to bentoml/bentos/llama2/13b-chat-fp16-a846/env/docker/Dockerfile index 6e67ca1c..9a323f70 100644 --- a/bentoml/bentos/gemma/2b-instruct-fp16-f020/env/docker/Dockerfile +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/docker/Dockerfile @@ -33,7 +33,6 @@ RUN curl -LO https://astral.sh/uv/install.sh && \ ARG BENTO_USER=bentoml ARG BENTO_USER_UID=1034 ARG BENTO_USER_GID=1034 - RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER ARG HF_TOKEN= ENV HF_TOKEN=$HF_TOKEN diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/docker/entrypoint.sh b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/env/docker/entrypoint.sh rename to bentoml/bentos/llama2/13b-chat-fp16-a846/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/install.sh b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/requirements.lock.txt b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/requirements.lock.txt similarity index 86% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/requirements.lock.txt rename to bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/requirements.lock.txt index 0684a350..92396f10 100644 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/requirements.lock.txt +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/requirements.lock.txt @@ -1,4 +1,5 @@ -aiohttp==3.9.5 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 aiosqlite==0.20.0 annotated-types==0.7.0 @@ -6,8 +7,8 @@ anyio==4.4.0 appdirs==1.4.4 asgiref==3.8.1 async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 +attrs==24.2.0 +bentoml==1.3.1 cattrs==23.1.2 certifi==2024.7.4 charset-normalizer==3.3.2 @@ -15,7 +16,7 @@ circus==0.18.0 click==8.1.7 click-option-group==0.5.6 cloudpickle==3.0.0 -cmake==3.30.1 +cmake==3.30.2 datasets==2.14.4 deepmerge==1.1.1 deprecated==1.2.14 @@ -26,7 +27,7 @@ dnspython==2.6.1 email-validator==2.2.0 exceptiongroup==1.2.2 fastapi==0.111.0 -fastapi-cli==0.0.4 +fastapi-cli==0.0.5 filelock==3.15.4 frozenlist==1.4.1 fs==2.4.16 @@ -36,13 +37,14 @@ httpcore==1.0.5 httptools==0.6.1 httpx==0.27.0 httpx-ws==0.6.0 -huggingface-hub==0.24.1 +huggingface-hub==0.24.5 idna==3.7 importlib-metadata==6.11.0 inflection==0.5.1 inquirerpy==0.3.4 interegular==0.3.3 jinja2==3.1.4 +jiter==0.5.0 jsonschema==4.23.0 jsonschema-specifications==2023.12.1 lark==1.1.9 @@ -71,9 +73,9 @@ nvidia-cusolver-cu12==11.4.5.107 nvidia-cusparse-cu12==12.1.0.106 nvidia-ml-py==11.525.150 nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.20 nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 +openai==1.40.3 opentelemetry-api==1.20.0 opentelemetry-instrumentation==0.41b0 opentelemetry-instrumentation-aiohttp-client==0.41b0 @@ -81,7 +83,7 @@ opentelemetry-instrumentation-asgi==0.41b0 opentelemetry-sdk==1.20.0 opentelemetry-semantic-conventions==0.41b0 opentelemetry-util-http==0.41b0 -orjson==3.10.6 +orjson==3.10.7 outlines==0.0.46 packaging==24.1 pandas==2.2.2 @@ -92,7 +94,7 @@ pip-requirements-parser==32.0.1 prometheus-client==0.20.0 prometheus-fastapi-instrumentator==7.0.0 prompt-toolkit==3.0.47 -protobuf==5.27.2 +protobuf==5.27.3 psutil==6.0.0 py-cpuinfo==9.0.0 pyairports==2.1.1 @@ -107,24 +109,24 @@ python-dotenv==1.0.1 python-json-logger==2.0.7 python-multipart==0.0.9 pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 referencing==0.35.1 -regex==2024.5.15 +regex==2024.7.24 requests==2.32.3 rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 +rpds-py==0.20.0 +safetensors==0.4.4 schema==0.7.7 sentencepiece==0.2.0 -setuptools==71.1.0 +setuptools==72.1.0 shellingham==1.5.4 simple-di==0.1.5 six==1.16.0 sniffio==1.3.1 starlette==0.37.2 -sympy==1.13.1 +sympy==1.13.2 tiktoken==0.7.0 tokenizers==0.19.1 tomli==2.0.1 @@ -132,7 +134,7 @@ tomli-w==1.0.0 torch==2.3.1 torchvision==0.18.1 tornado==6.4.1 -tqdm==4.66.4 +tqdm==4.66.5 transformers==4.43.1 triton==2.3.1 typer==0.12.3 @@ -140,12 +142,12 @@ typing-extensions==4.12.2 tzdata==2024.1 ujson==5.10.0 urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 +uv==0.2.35 +uvicorn==0.30.5 uvloop==0.19.0 vllm==0.5.3.post1 vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 +watchfiles==0.23.0 wcwidth==0.2.13 websockets==12.0 wrapt==1.16.0 @@ -153,4 +155,4 @@ wsproto==1.2.0 xformers==0.0.27 xxhash==3.4.1 yarl==1.9.4 -zipp==3.19.2 +zipp==3.20.0 diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/requirements.txt b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/requirements.txt similarity index 85% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/requirements.txt rename to bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/requirements.txt index dc141d0d..84ab2103 100644 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/requirements.txt +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/requirements.txt @@ -1,4 +1,4 @@ -bentoml==1.3.0 +bentoml==1.3.1 torch==2.3.1 vllm==0.5.3.post1 numpy==1.26.0 diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/version.txt b/bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/env/python/version.txt rename to bentoml/bentos/llama2/13b-chat-fp16-a846/env/python/version.txt diff --git a/bentoml/bentos/llama2/13b-chat-fp16-a846/src/bentofile.yaml b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/bentofile.yaml new file mode 100644 index 00000000..c80e1719 --- /dev/null +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: meta-llama/Llama-2-13b-chat-hf + openllm_alias: 13b,13b-chat + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/bento_constants.py b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/bento_constants.py rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/openllm_config.yaml index 64bce5f7..a3772792 100644 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/bento_constants.py +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: llama-2-chat engine_config: dtype: half @@ -16,5 +14,3 @@ gpu_type: nvidia-tesla-a100 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama2/13b-chat-fp16-a846/src/service.py b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/404.html b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/404.html rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/404.html diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/apple-touch-icon.png b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/chat.html b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/chat.html rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/chat.html diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/chat.txt b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/chat.txt rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/chat.txt diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/favicon-16x16.png b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/favicon-16x16.png rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/favicon.ico b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/favicon.ico rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/favicon.ico diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/index.html b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/index.html rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/index.html diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/index.txt b/bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/src/ui/index.txt rename to bentoml/bentos/llama2/13b-chat-fp16-a846/src/ui/index.txt diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/docker/Dockerfile b/bentoml/bentos/llama2/70b-chat-fp16-11af/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/install.sh b/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/requirements.lock.txt b/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/requirements.txt b/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/bentofile.yaml b/bentoml/bentos/llama2/70b-chat-fp16-11af/src/bentofile.yaml deleted file mode 100644 index 1a7f0bdb..00000000 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: meta-llama/Llama-2-70b-chat-hf - openllm_alias: 70b,70b-chat - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/service.py b/bentoml/bentos/llama2/70b-chat-fp16-11af/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama2/13b-chat-fp16-603a/README.md b/bentoml/bentos/llama2/70b-chat-fp16-fcef/README.md similarity index 91% rename from bentoml/bentos/llama2/13b-chat-fp16-603a/README.md rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/README.md index 0f35ecec..078de7b7 100644 --- a/bentoml/bentos/llama2/13b-chat-fp16-603a/README.md +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/README.md @@ -1,6 +1,6 @@ -# llama2:13b-chat-fp16-603a +# llama2:70b-chat-fp16-fcef -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/apis/openapi.yaml b/bentoml/bentos/llama2/70b-chat-fp16-fcef/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/apis/openapi.yaml rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/apis/openapi.yaml index 9d184888..c67860a3 100644 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/apis/openapi.yaml +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/apis/schema.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/apis/schema.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/apis/schema.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/apis/schema.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/bento.yaml b/bentoml/bentos/llama2/70b-chat-fp16-fcef/bento.yaml similarity index 96% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/bento.yaml rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/bento.yaml index 7007fa85..20b17444 100644 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/bento.yaml +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama2 -version: 70b-chat-fp16-11af -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:17:55.444721+00:00' +version: 70b-chat-fp16-fcef +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:40:29.257919+00:00' labels: model_name: meta-llama/Llama-2-70b-chat-hf openllm_alias: 70b,70b-chat diff --git a/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/docker/Dockerfile b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/docker/entrypoint.sh b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/env/docker/entrypoint.sh rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/install.sh b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/requirements.lock.txt b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/requirements.txt b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/version.txt b/bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/env/python/version.txt rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/env/python/version.txt diff --git a/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/bentofile.yaml b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/bentofile.yaml new file mode 100644 index 00000000..bc6f2047 --- /dev/null +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: meta-llama/Llama-2-70b-chat-hf + openllm_alias: 70b,70b-chat + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/bento_constants.py b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/bento_constants.py rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/openllm_config.yaml index 80a52495..9ec98f29 100644 --- a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/bento_constants.py +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: llama-2-chat engine_config: dtype: half @@ -17,5 +15,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/service.py b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/404.html b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/404.html rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/404.html diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/apple-touch-icon.png b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/chat.html b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/chat.html rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/chat.html diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/chat.txt b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/chat.txt rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/chat.txt diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/favicon-16x16.png b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/favicon-16x16.png rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/favicon.ico b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/favicon.ico rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/favicon.ico diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/index.html b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/index.html rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/index.html diff --git a/bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/index.txt b/bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama2/70b-chat-fp16-11af/src/ui/index.txt rename to bentoml/bentos/llama2/70b-chat-fp16-fcef/src/ui/index.txt diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/README.md b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/README.md new file mode 100644 index 00000000..d44f3388 --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/README.md @@ -0,0 +1,16 @@ +# llama2:7b-chat-awq-4bit-753b + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/apis/openapi.yaml b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/apis/openapi.yaml rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/apis/openapi.yaml index 599ae113..908aa81f 100644 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/apis/openapi.yaml +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/apis/schema.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/apis/schema.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/apis/schema.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/apis/schema.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/bento.yaml b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/bento.yaml similarity index 96% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/bento.yaml rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/bento.yaml index f6daf754..e94b9d34 100644 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/bento.yaml +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama2 -version: 7b-chat-awq-4bit-c733 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:18:02.586833+00:00' +version: 7b-chat-awq-4bit-753b +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:40:36.642737+00:00' labels: model_name: TheBloke/Llama-2-7B-Chat-AWQ openllm_alias: 7b-4bit,7b-chat-4bit diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/docker/Dockerfile b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/docker/entrypoint.sh b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/docker/entrypoint.sh rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/install.sh b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/requirements.lock.txt b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/requirements.txt b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/version.txt b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/version.txt rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/env/python/version.txt diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/bentofile.yaml b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/bentofile.yaml rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/bentofile.yaml index ac7b7cd6..e7dcbe1b 100644 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/bentofile.yaml +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 7b-4bit,7b-chat-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/bento_constants.py b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/bento_constants.py rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/openllm_config.yaml index ba4735cb..47e5ac20 100644 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/bento_constants.py +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: llama-2-chat engine_config: enforce_eager: true @@ -17,5 +15,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/service.py b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/404.html b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/404.html rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/404.html diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/apple-touch-icon.png b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/chat.html b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/chat.html rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/chat.html diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/chat.txt b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/chat.txt rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/chat.txt diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/favicon-16x16.png b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/favicon-16x16.png rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/favicon.ico b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/favicon.ico rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/favicon.ico diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/index.html b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/index.html rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/index.html diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/index.txt b/bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/ui/index.txt rename to bentoml/bentos/llama2/7b-chat-awq-4bit-753b/src/ui/index.txt diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/README.md b/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/README.md deleted file mode 100644 index 144fcda6..00000000 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama2:7b-chat-awq-4bit-c733 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/docker/Dockerfile b/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/install.sh b/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/requirements.lock.txt b/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/requirements.txt b/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/service.py b/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama2/7b-chat-awq-4bit-c733/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/docker/Dockerfile b/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/install.sh b/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/requirements.lock.txt b/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/requirements.txt b/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/bentofile.yaml b/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/bentofile.yaml deleted file mode 100644 index 4a0b2db3..00000000 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: meta-llama/Llama-2-7b-chat-hf - openllm_alias: 7b,7b-chat - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/service.py b/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/README.md b/bentoml/bentos/llama2/7b-chat-fp16-dc53/README.md similarity index 91% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/README.md rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/README.md index d9b66481..0b8ff92c 100644 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/README.md +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/README.md @@ -1,6 +1,6 @@ -# llama2:7b-chat-fp16-b8c6 +# llama2:7b-chat-fp16-dc53 -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/apis/openapi.yaml b/bentoml/bentos/llama2/7b-chat-fp16-dc53/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/apis/openapi.yaml rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/apis/openapi.yaml index 47973a92..aa0a2f99 100644 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/apis/openapi.yaml +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/apis/schema.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/apis/schema.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/apis/schema.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/apis/schema.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/bento.yaml b/bentoml/bentos/llama2/7b-chat-fp16-dc53/bento.yaml similarity index 96% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/bento.yaml rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/bento.yaml index f01c67bb..e0368295 100644 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/bento.yaml +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama2 -version: 7b-chat-fp16-b8c6 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:17:41.042045+00:00' +version: 7b-chat-fp16-dc53 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:40:14.322371+00:00' labels: model_name: meta-llama/Llama-2-7b-chat-hf openllm_alias: 7b,7b-chat diff --git a/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/docker/Dockerfile b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/docker/entrypoint.sh b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/docker/entrypoint.sh rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/install.sh b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/requirements.lock.txt b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/requirements.txt b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/version.txt b/bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/env/python/version.txt rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/env/python/version.txt diff --git a/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/bentofile.yaml b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/bentofile.yaml new file mode 100644 index 00000000..511759ef --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: meta-llama/Llama-2-7b-chat-hf + openllm_alias: 7b,7b-chat + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/bento_constants.py b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/bento_constants.py rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/openllm_config.yaml index f1ea8ac7..e9fcf90d 100644 --- a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/bento_constants.py +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: llama-2-chat engine_config: dtype: half @@ -16,5 +14,3 @@ gpu_type: nvidia-tesla-t4 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/service.py b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/404.html b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/404.html rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/404.html diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/apple-touch-icon.png b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/chat.html b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/chat.html rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/chat.html diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/chat.txt b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/chat.txt rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/chat.txt diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/favicon-16x16.png b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/favicon-16x16.png rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/favicon.ico b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/favicon.ico rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/favicon.ico diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/index.html b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/index.html rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/index.html diff --git a/bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/index.txt b/bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama2/7b-chat-fp16-b8c6/src/ui/index.txt rename to bentoml/bentos/llama2/7b-chat-fp16-dc53/src/ui/index.txt diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/README.md b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/README.md new file mode 100644 index 00000000..e94ca107 --- /dev/null +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/README.md @@ -0,0 +1,16 @@ +# llama3.1:405b-instruct-awq-4bit-675e + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/apis/openapi.yaml b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/apis/openapi.yaml rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/apis/openapi.yaml index 2d3fddd7..2ee51190 100644 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/apis/openapi.yaml +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/apis/schema.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/apis/schema.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/apis/schema.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/apis/schema.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/bento.yaml b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/bento.yaml similarity index 96% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/bento.yaml rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/bento.yaml index e82c60c2..27dcce0b 100644 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/bento.yaml +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama3.1 -version: 405b-instruct-awq-4bit-a733 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:19:21.483210+00:00' +version: 405b-instruct-awq-4bit-675e +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:42:15.462740+00:00' labels: model_name: hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4 openllm_alias: 405b-4bit,405b-instruct-4bit diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/docker/Dockerfile b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/docker/entrypoint.sh b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/docker/entrypoint.sh rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/install.sh b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/requirements.txt b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/version.txt b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/version.txt rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/env/python/version.txt diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/bentofile.yaml b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/bentofile.yaml similarity index 51% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/bentofile.yaml rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/bentofile.yaml index 734211cf..b1a20650 100644 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/bentofile.yaml +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 405b-4bit,405b-instruct-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/bento_constants.py b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/bento_constants.py rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/openllm_config.yaml index 84ef06ab..23da5edb 100644 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/bento_constants.py +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: max_model_len: 2048 model: hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4 @@ -16,5 +14,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/service.py b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/404.html b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/404.html rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/404.html diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/apple-touch-icon.png b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/chat.html b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/chat.html rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/chat.html diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/chat.txt b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/chat.txt rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/chat.txt diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/favicon-16x16.png b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/favicon-16x16.png rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/favicon.ico b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/favicon.ico rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/favicon.ico diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/index.html b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/index.html rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/index.html diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/index.txt b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/ui/index.txt rename to bentoml/bentos/llama3.1/405b-instruct-awq-4bit-675e/src/ui/index.txt diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/README.md b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/README.md deleted file mode 100644 index adf33a43..00000000 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama3.1:405b-instruct-awq-4bit-a733 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/docker/Dockerfile b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/install.sh b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/requirements.txt b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/service.py b/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama3.1/405b-instruct-awq-4bit-a733/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/README.md b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/README.md new file mode 100644 index 00000000..c1b93792 --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/README.md @@ -0,0 +1,16 @@ +# llama3.1:70b-instruct-awq-4bit-28ed + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/apis/openapi.yaml b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/apis/openapi.yaml rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/apis/openapi.yaml index 9a2ab420..c09294db 100644 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/apis/openapi.yaml +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/apis/schema.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/apis/schema.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/apis/schema.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/apis/schema.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/bento.yaml b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/bento.yaml similarity index 96% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/bento.yaml rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/bento.yaml index 2760cd87..1ba4768d 100644 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/bento.yaml +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama3.1 -version: 70b-instruct-awq-4bit-f55b -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:19:13.962795+00:00' +version: 70b-instruct-awq-4bit-28ed +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:42:07.927075+00:00' labels: model_name: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 openllm_alias: 70b-4bit,70b-instruct-4bit diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/docker/Dockerfile b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/docker/entrypoint.sh b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/docker/entrypoint.sh rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/install.sh b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/requirements.txt b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/version.txt b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/version.txt rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/env/python/version.txt diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/bentofile.yaml b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/bentofile.yaml similarity index 51% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/bentofile.yaml rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/bentofile.yaml index c6c5fc79..ad9c3f88 100644 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/bentofile.yaml +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 70b-4bit,70b-instruct-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/bento_constants.py b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/bento_constants.py rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/openllm_config.yaml index 745a542f..f7d431b9 100644 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/bento_constants.py +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: max_model_len: 2048 model: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 @@ -15,5 +13,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/service.py b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/404.html b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/404.html rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/404.html diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/apple-touch-icon.png b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/chat.html b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/chat.html rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/chat.html diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/chat.txt b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/chat.txt rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/chat.txt diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/favicon-16x16.png b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/favicon-16x16.png rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/favicon.ico b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/favicon.ico rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/favicon.ico diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/index.html b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/index.html rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/index.html diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/index.txt b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/ui/index.txt rename to bentoml/bentos/llama3.1/70b-instruct-awq-4bit-28ed/src/ui/index.txt diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/README.md b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/README.md deleted file mode 100644 index 07d99515..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama3.1:70b-instruct-awq-4bit-f55b - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/docker/Dockerfile b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/install.sh b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/requirements.txt b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/service.py b/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-awq-4bit-f55b/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/README.md b/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/README.md deleted file mode 100644 index b6dd5120..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama3.1:70b-instruct-fp16-b665 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/docker/Dockerfile b/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/install.sh b/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/requirements.txt b/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/service.py b/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/README.md b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/README.md new file mode 100644 index 00000000..fbf121be --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/README.md @@ -0,0 +1,16 @@ +# llama3.1:70b-instruct-fp16-b66b + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/apis/openapi.yaml b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/apis/openapi.yaml rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/apis/openapi.yaml index 81d1df50..91dca5ef 100644 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/apis/openapi.yaml +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/apis/schema.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/apis/schema.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/apis/schema.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/apis/schema.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/bento.yaml b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/bento.yaml similarity index 96% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/bento.yaml rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/bento.yaml index 02a19efd..578a5c23 100644 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/bento.yaml +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama3.1 -version: 70b-instruct-fp16-b665 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:19:06.699064+00:00' +version: 70b-instruct-fp16-b66b +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:42:00.465297+00:00' labels: model_name: meta-llama/Meta-Llama-3.1-70B-Instruct openllm_alias: 70b,70b-instruct diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/docker/Dockerfile b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/docker/entrypoint.sh b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/docker/entrypoint.sh rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/install.sh b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/requirements.txt b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/version.txt b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/env/python/version.txt rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/env/python/version.txt diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/bentofile.yaml b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/bentofile.yaml rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/bentofile.yaml index 3e7df81a..047f655f 100644 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/bentofile.yaml +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 70b,70b-instruct platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/bento_constants.py b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/bento_constants.py rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/openllm_config.yaml index 31978703..72a445a0 100644 --- a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/bento_constants.py +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: max_model_len: 2048 model: meta-llama/Meta-Llama-3.1-70B-Instruct @@ -15,5 +13,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/service.py b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/404.html b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/404.html rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/404.html diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/apple-touch-icon.png b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/chat.html b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/chat.html rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/chat.html diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/chat.txt b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/chat.txt rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/chat.txt diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/favicon-16x16.png b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/favicon-16x16.png rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/favicon.ico b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/favicon.ico rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/favicon.ico diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/index.html b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/index.html rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/index.html diff --git a/bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/index.txt b/bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama3.1/70b-instruct-fp16-b665/src/ui/index.txt rename to bentoml/bentos/llama3.1/70b-instruct-fp16-b66b/src/ui/index.txt diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/README.md b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/README.md new file mode 100644 index 00000000..4acf47c2 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/README.md @@ -0,0 +1,16 @@ +# llama3.1:8b-instruct-awq-4bit-5cb2 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/apis/openapi.yaml b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/apis/openapi.yaml rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/apis/openapi.yaml index e0b76b1b..74da5e8b 100644 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/apis/openapi.yaml +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/apis/schema.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/apis/schema.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/apis/schema.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/apis/schema.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/bento.yaml b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/bento.yaml similarity index 96% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/bento.yaml rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/bento.yaml index d82f01b6..340fd30d 100644 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/bento.yaml +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama3.1 -version: 8b-instruct-awq-4bit-f737 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:18:59.587834+00:00' +version: 8b-instruct-awq-4bit-5cb2 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:41:53.094549+00:00' labels: model_name: hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 openllm_alias: 8b-4bit,8b-instruct-4bit diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/docker/Dockerfile b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/docker/entrypoint.sh b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/docker/entrypoint.sh rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/install.sh b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/requirements.txt b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/version.txt b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/version.txt rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/env/python/version.txt diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/bentofile.yaml b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/bentofile.yaml similarity index 51% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/bentofile.yaml rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/bentofile.yaml index be693b1f..fbab5ffe 100644 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/bentofile.yaml +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 8b-4bit,8b-instruct-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/bento_constants.py b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/bento_constants.py rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/openllm_config.yaml index 1ef7842f..cc5c5227 100644 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/bento_constants.py +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: max_model_len: 2048 model: hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 @@ -15,5 +13,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/service.py b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/404.html b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/404.html rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/404.html diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/apple-touch-icon.png b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/chat.html b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/chat.html rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/chat.html diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/chat.txt b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/chat.txt rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/chat.txt diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/favicon-16x16.png b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/favicon-16x16.png rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/favicon.ico b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/favicon.ico rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/favicon.ico diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/index.html b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/index.html rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/index.html diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/index.txt b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/ui/index.txt rename to bentoml/bentos/llama3.1/8b-instruct-awq-4bit-5cb2/src/ui/index.txt diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/README.md b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/README.md deleted file mode 100644 index 60210933..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama3.1:8b-instruct-awq-4bit-f737 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/docker/Dockerfile b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/install.sh b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/requirements.txt b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/service.py b/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-awq-4bit-f737/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/README.md b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/README.md new file mode 100644 index 00000000..2bf8bda4 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/README.md @@ -0,0 +1,16 @@ +# llama3.1:8b-instruct-fp16-1c1c + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/apis/openapi.yaml b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/apis/openapi.yaml rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/apis/openapi.yaml index 7f26320a..d2c0d639 100644 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/apis/openapi.yaml +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama3.1:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/apis/schema.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/apis/schema.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/apis/schema.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/apis/schema.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/bento.yaml b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/bento.yaml similarity index 96% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/bento.yaml rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/bento.yaml index a3b9d3c9..661b7aa3 100644 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/bento.yaml +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama3.1 -version: 8b-instruct-fp16-6d7b -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:18:52.495525+00:00' +version: 8b-instruct-fp16-1c1c +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:41:45.493664+00:00' labels: model_name: meta-llama/Meta-Llama-3.1-8B-Instruct openllm_alias: 8b,8b-instruct diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/docker/Dockerfile b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/docker/entrypoint.sh b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/docker/entrypoint.sh rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/install.sh b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/requirements.txt b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/version.txt b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/version.txt rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/env/python/version.txt diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/bentofile.yaml b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/bentofile.yaml rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/bentofile.yaml index 9ea8eba7..3897dcee 100644 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/bentofile.yaml +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 8b,8b-instruct platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/bento_constants.py b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/bento_constants.py rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/openllm_config.yaml index 95250477..eb744a75 100644 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/bento_constants.py +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -15,5 +13,3 @@ gpu_type: nvidia-tesla-l4 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/service.py b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/404.html b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/404.html rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/404.html diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/apple-touch-icon.png b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/chat.html b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/chat.html rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/chat.html diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/chat.txt b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/chat.txt rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/chat.txt diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/favicon-16x16.png b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/favicon-16x16.png rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/favicon.ico b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/favicon.ico rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/favicon.ico diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/index.html b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/index.html rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/index.html diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/index.txt b/bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/ui/index.txt rename to bentoml/bentos/llama3.1/8b-instruct-fp16-1c1c/src/ui/index.txt diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/README.md b/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/README.md deleted file mode 100644 index 3950ba2f..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama3.1:8b-instruct-fp16-6d7b - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/docker/Dockerfile b/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/install.sh b/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/requirements.lock.txt b/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/requirements.txt b/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/service.py b/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama3.1/8b-instruct-fp16-6d7b/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/README.md b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/README.md deleted file mode 100644 index 8342fec6..00000000 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama3:70b-instruct-awq-4bit-9204 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/docker/Dockerfile b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/install.sh b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/requirements.lock.txt b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/requirements.txt b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/service.py b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/README.md b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/README.md new file mode 100644 index 00000000..e372549a --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/README.md @@ -0,0 +1,16 @@ +# llama3:70b-instruct-awq-4bit-9ceb + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/apis/openapi.yaml b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/apis/openapi.yaml rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/apis/openapi.yaml index 7ccae0e7..6c84187b 100644 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/apis/openapi.yaml +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/apis/schema.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/apis/schema.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/apis/schema.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/apis/schema.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/bento.yaml b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/bento.yaml similarity index 96% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/bento.yaml rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/bento.yaml index 05c626ed..8f05201d 100644 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/bento.yaml +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama3 -version: 70b-instruct-awq-4bit-9204 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:18:30.974214+00:00' +version: 70b-instruct-awq-4bit-9ceb +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:41:22.327636+00:00' labels: model_name: casperhansen/llama-3-70b-instruct-awq openllm_alias: 70b-4bit,70b-instruct-4bit diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/docker/Dockerfile b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/docker/entrypoint.sh b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/docker/entrypoint.sh rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/install.sh b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/requirements.lock.txt b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/requirements.txt b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/version.txt b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/env/python/version.txt rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/env/python/version.txt diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/bentofile.yaml b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/bentofile.yaml rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/bentofile.yaml index 702d17df..dfb0083f 100644 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/bentofile.yaml +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 70b-4bit,70b-instruct-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/bento_constants.py b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/bento_constants.py rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/openllm_config.yaml index de96562d..d5170b3d 100644 --- a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/bento_constants.py +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: max_model_len: 2048 model: casperhansen/llama-3-70b-instruct-awq @@ -15,5 +13,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/service.py b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/404.html b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/404.html rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/404.html diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/apple-touch-icon.png b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/chat.html b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/chat.html rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/chat.html diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/chat.txt b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/chat.txt rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/chat.txt diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/favicon-16x16.png b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/favicon-16x16.png rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/favicon.ico b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/favicon.ico rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/favicon.ico diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/index.html b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/index.html rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/index.html diff --git a/bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/index.txt b/bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-awq-4bit-9204/src/ui/index.txt rename to bentoml/bentos/llama3/70b-instruct-awq-4bit-9ceb/src/ui/index.txt diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/README.md b/bentoml/bentos/llama3/70b-instruct-fp16-53f1/README.md deleted file mode 100644 index d92cf19c..00000000 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama3:70b-instruct-fp16-53f1 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/docker/Dockerfile b/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/install.sh b/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/requirements.lock.txt b/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/requirements.txt b/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/service.py b/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/README.md b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/README.md new file mode 100644 index 00000000..5e18cf1c --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/README.md @@ -0,0 +1,16 @@ +# llama3:70b-instruct-fp16-c3e4 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/apis/openapi.yaml b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/apis/openapi.yaml rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/apis/openapi.yaml index 6ef7b1c9..9b05fe1c 100644 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/apis/openapi.yaml +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/apis/schema.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/apis/schema.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/apis/schema.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/apis/schema.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/bento.yaml b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/bento.yaml similarity index 96% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/bento.yaml rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/bento.yaml index 9c38d2cc..497d070c 100644 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/bento.yaml +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama3 -version: 70b-instruct-fp16-53f1 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:18:45.363610+00:00' +version: 70b-instruct-fp16-c3e4 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:41:37.685775+00:00' labels: model_name: meta-llama/Meta-Llama-3-70B-Instruct openllm_alias: 70b,70b-instruct diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/docker/Dockerfile b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/docker/entrypoint.sh b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/docker/entrypoint.sh rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/install.sh b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/requirements.lock.txt b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/requirements.txt b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/version.txt b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/env/python/version.txt rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/env/python/version.txt diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/bentofile.yaml b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/bentofile.yaml rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/bentofile.yaml index de5b2eff..088c0863 100644 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/bentofile.yaml +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 70b,70b-instruct platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/bento_constants.py b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/bento_constants.py rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/openllm_config.yaml index cd7d90ee..40b1fcd2 100644 --- a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/bento_constants.py +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: max_model_len: 2048 model: meta-llama/Meta-Llama-3-70B-Instruct @@ -15,5 +13,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/service.py b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/404.html b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/404.html rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/404.html diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/apple-touch-icon.png b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/chat.html b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/chat.html rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/chat.html diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/chat.txt b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/chat.txt rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/chat.txt diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/favicon-16x16.png b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/favicon-16x16.png rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/favicon.ico b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/favicon.ico rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/favicon.ico diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/index.html b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/index.html rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/index.html diff --git a/bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/index.txt b/bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama3/70b-instruct-fp16-53f1/src/ui/index.txt rename to bentoml/bentos/llama3/70b-instruct-fp16-c3e4/src/ui/index.txt diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/README.md b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/README.md new file mode 100644 index 00000000..1409e81a --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/README.md @@ -0,0 +1,16 @@ +# llama3:8b-instruct-awq-4bit-1c94 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/apis/openapi.yaml b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/apis/openapi.yaml rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/apis/openapi.yaml index 7d4c8745..d68c4b1c 100644 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/apis/openapi.yaml +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/apis/schema.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/apis/schema.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/apis/schema.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/apis/schema.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/bento.yaml b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/bento.yaml similarity index 96% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/bento.yaml rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/bento.yaml index 5ac6626d..aaa4ca2c 100644 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/bento.yaml +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama3 -version: 8b-instruct-awq-4bit-985b -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:18:23.860248+00:00' +version: 8b-instruct-awq-4bit-1c94 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:41:14.470978+00:00' labels: model_name: casperhansen/llama-3-8b-instruct-awq openllm_alias: 8b-4bit,8b-instruct-4bit diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/docker/Dockerfile b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/docker/entrypoint.sh b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/docker/entrypoint.sh rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/install.sh b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/requirements.lock.txt b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/requirements.txt b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/version.txt b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/version.txt rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/env/python/version.txt diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/bentofile.yaml b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/bentofile.yaml rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/bentofile.yaml index ecbf6032..c522578a 100644 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/bentofile.yaml +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 8b-4bit,8b-instruct-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/bento_constants.py b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/bento_constants.py rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/openllm_config.yaml index a34462d5..0caadc00 100644 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/bento_constants.py +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: max_model_len: 2048 model: casperhansen/llama-3-8b-instruct-awq @@ -15,5 +13,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/service.py b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/404.html b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/404.html rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/404.html diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/apple-touch-icon.png b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/chat.html b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/chat.html rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/chat.html diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/chat.txt b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/chat.txt rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/chat.txt diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/favicon-16x16.png b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/favicon-16x16.png rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/favicon.ico b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/favicon.ico rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/favicon.ico diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/index.html b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/index.html rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/index.html diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/index.txt b/bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/ui/index.txt rename to bentoml/bentos/llama3/8b-instruct-awq-4bit-1c94/src/ui/index.txt diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/README.md b/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/README.md deleted file mode 100644 index 2ec1dc17..00000000 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama3:8b-instruct-awq-4bit-985b - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/docker/Dockerfile b/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/install.sh b/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/requirements.lock.txt b/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/requirements.txt b/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/service.py b/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama3/8b-instruct-awq-4bit-985b/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/README.md b/bentoml/bentos/llama3/8b-instruct-fp16-8638/README.md deleted file mode 100644 index e2cd176c..00000000 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# llama3:8b-instruct-fp16-8638 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/docker/Dockerfile b/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/install.sh b/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/requirements.lock.txt b/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/requirements.txt b/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/service.py b/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/README.md b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/README.md new file mode 100644 index 00000000..251d1a8a --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/README.md @@ -0,0 +1,16 @@ +# llama3:8b-instruct-fp16-ba7c + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/apis/openapi.yaml b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/apis/openapi.yaml rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/apis/openapi.yaml index addd7eea..0d3a325e 100644 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/apis/openapi.yaml +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# llama3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# llama3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/apis/schema.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/apis/schema.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/apis/schema.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/apis/schema.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/bento.yaml b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/bento.yaml similarity index 96% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/bento.yaml rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/bento.yaml index 00bf0d29..1423f2d1 100644 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/bento.yaml +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: llama3 -version: 8b-instruct-fp16-8638 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:18:38.175219+00:00' +version: 8b-instruct-fp16-ba7c +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:41:29.916452+00:00' labels: model_name: meta-llama/Meta-Llama-3-8B-Instruct openllm_alias: 8b,8b-instruct diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/docker/Dockerfile b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/docker/entrypoint.sh b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/env/docker/entrypoint.sh rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/docker/entrypoint.sh diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/install.sh b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/requirements.lock.txt b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/requirements.txt b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/version.txt b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/version.txt similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/env/python/version.txt rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/env/python/version.txt diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/bentofile.yaml b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/bentofile.yaml rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/bentofile.yaml index c0ca84e8..9ae1c02f 100644 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/bentofile.yaml +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 8b,8b-instruct platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/bento_constants.py b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/bento_constants.py rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/openllm_config.yaml index 172f4bef..3da612d0 100644 --- a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/bento_constants.py +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -15,5 +13,3 @@ gpu_type: nvidia-tesla-l4 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/service.py b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/404.html b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/404.html similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/404.html rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/404.html diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/apple-touch-icon.png b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/apple-touch-icon.png rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/chat.html b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/chat.html similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/chat.html rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/chat.html diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/chat.txt b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/chat.txt rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/chat.txt diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/favicon-16x16.png b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/favicon-16x16.png rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/favicon.ico b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/favicon.ico rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/favicon.ico diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/index.html b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/index.html similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/index.html rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/index.html diff --git a/bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/index.txt b/bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/index.txt similarity index 100% rename from bentoml/bentos/llama3/8b-instruct-fp16-8638/src/ui/index.txt rename to bentoml/bentos/llama3/8b-instruct-fp16-ba7c/src/ui/index.txt diff --git a/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/README.md b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/README.md new file mode 100644 index 00000000..735c34eb --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/README.md @@ -0,0 +1,16 @@ +# mistral-large:123b-instruct-awq-4bit-c380 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/apis/openapi.yaml b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/apis/openapi.yaml similarity index 96% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/apis/openapi.yaml rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/apis/openapi.yaml index 12864f0f..bdd8d1e4 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/apis/openapi.yaml +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# mistral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# mistral-large:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ @@ -1031,7 +1065,7 @@ info: * [\U0001F4AC Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML\ \ Slack community.\n* [\U0001F41B GitHub Issues](https://github.com/bentoml/BentoML/issues):\ \ Report bugs and feature requests.\n* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description).\n" - title: mistral + title: mistral-large version: None openapi: 3.0.2 paths: @@ -1039,7 +1073,7 @@ paths: post: description: "\n light-weight chat API that takes in a list of messages\ \ and returns a response\n " - operationId: mistral__chat + operationId: mistral-large__chat requestBody: content: application/json: @@ -1103,7 +1137,7 @@ paths: /api/generate: post: description: '' - operationId: mistral__generate + operationId: mistral-large__generate requestBody: content: application/json: diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/apis/schema.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/apis/schema.json similarity index 99% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/apis/schema.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/apis/schema.json index ec7aed4b..7d417819 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/apis/schema.json +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/apis/schema.json @@ -1,5 +1,5 @@ { - "name": "mistral", + "name": "mistral-large", "type": "service", "routes": [ { diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/bento.yaml b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/bento.yaml similarity index 91% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/bento.yaml rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/bento.yaml index 345dfcbc..f1760fe0 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/bento.yaml +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/bento.yaml @@ -1,23 +1,23 @@ service: service:VLLM -name: mistral -version: large-123b-instruct-awq-4bit-ec0c -bentoml_version: 1.3.0 -creation_time: '2024-08-01T18:58:13.620470+00:00' +name: mistral-large +version: 123b-instruct-awq-4bit-c380 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:41:06.651149+00:00' labels: model_name: casperhansen/mistral-large-instruct-2407-awq - openllm_alias: large-4bit,large-instruct-4bit, 123b-4bit, 123b-instruct-4bit + openllm_alias: 123b-4bit,123b-instruct-2407-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat models: [] runners: [] -entry_service: mistral +entry_service: mistral-large services: -- name: mistral +- name: mistral-large service: '' models: [] dependencies: [] config: - name: mistral + name: mistral-large resources: gpu: 1 gpu_type: nvidia-a100-80g @@ -26,7 +26,7 @@ services: envs: - name: HF_TOKEN schema: - name: mistral + name: mistral-large type: service routes: - name: chat diff --git a/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/docker/Dockerfile b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/docker/entrypoint.sh b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/docker/entrypoint.sh rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/docker/entrypoint.sh diff --git a/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/install.sh b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/requirements.lock.txt b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/requirements.txt b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/version.txt b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/version.txt similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/version.txt rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/env/python/version.txt diff --git a/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/bentofile.yaml b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/bentofile.yaml new file mode 100644 index 00000000..52262518 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: casperhansen/mistral-large-instruct-2407-awq + openllm_alias: 123b-4bit,123b-instruct-2407-4bit + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/bento_constants.py b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/openllm_config.yaml similarity index 72% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/bento_constants.py rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/openllm_config.yaml index df4d1968..46fe0abd 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/bento_constants.py +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: mistral-instruct engine_config: dtype: half @@ -7,14 +5,12 @@ model: casperhansen/mistral-large-instruct-2407-awq extra_labels: model_name: casperhansen/mistral-large-instruct-2407-awq - openllm_alias: large-4bit,large-instruct-4bit, 123b-4bit, 123b-instruct-4bit + openllm_alias: 123b-4bit,123b-instruct-2407-4bit project: vllm-chat service_config: - name: mistral + name: mistral-large resources: gpu: 1 gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/service.py b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/404.html b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/404.html similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/404.html rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/404.html diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/apple-touch-icon.png b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/apple-touch-icon.png rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/chat.html b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/chat.html similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/chat.html rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/chat.html diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/chat.txt b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/chat.txt rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/chat.txt diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/favicon-16x16.png b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/favicon-16x16.png rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/favicon.ico b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/favicon.ico rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/favicon.ico diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/index.html b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/index.html similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/index.html rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/index.html diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/index.txt b/bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/index.txt similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/ui/index.txt rename to bentoml/bentos/mistral-large/123b-instruct-awq-4bit-c380/src/ui/index.txt diff --git a/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/README.md b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/README.md new file mode 100644 index 00000000..a06f326a --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/README.md @@ -0,0 +1,16 @@ +# mistral-large:123b-instruct-fp16-a203 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/apis/openapi.yaml b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/apis/openapi.yaml similarity index 96% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/apis/openapi.yaml rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/apis/openapi.yaml index 9293f2e7..307e3bcd 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/apis/openapi.yaml +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# mistral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# mistral-large:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ @@ -1031,7 +1065,7 @@ info: * [\U0001F4AC Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML\ \ Slack community.\n* [\U0001F41B GitHub Issues](https://github.com/bentoml/BentoML/issues):\ \ Report bugs and feature requests.\n* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description).\n" - title: mistral + title: mistral-large version: None openapi: 3.0.2 paths: @@ -1039,7 +1073,7 @@ paths: post: description: "\n light-weight chat API that takes in a list of messages\ \ and returns a response\n " - operationId: mistral__chat + operationId: mistral-large__chat requestBody: content: application/json: @@ -1103,7 +1137,7 @@ paths: /api/generate: post: description: '' - operationId: mistral__generate + operationId: mistral-large__generate requestBody: content: application/json: diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/apis/schema.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/apis/schema.json similarity index 99% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/apis/schema.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/apis/schema.json index a554f8c7..e65f5178 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/apis/schema.json +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/apis/schema.json @@ -1,5 +1,5 @@ { - "name": "mistral", + "name": "mistral-large", "type": "service", "routes": [ { diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/bento.yaml b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/bento.yaml similarity index 91% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/bento.yaml rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/bento.yaml index 21e2b6ed..9c91a323 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/bento.yaml +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/bento.yaml @@ -1,23 +1,23 @@ service: service:VLLM -name: mistral -version: large-123b-instruct-fp16-cadc -bentoml_version: 1.3.0 -creation_time: '2024-08-01T18:58:04.956406+00:00' +name: mistral-large +version: 123b-instruct-fp16-a203 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:40:58.981250+00:00' labels: model_name: mistralai/Mistral-Large-Instruct-2407 - openllm_alias: large,large-instruct, 123b, 123b-instruct + openllm_alias: 123b, 123b-instruct-2407 platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat models: [] runners: [] -entry_service: mistral +entry_service: mistral-large services: -- name: mistral +- name: mistral-large service: '' models: [] dependencies: [] config: - name: mistral + name: mistral-large resources: gpu: 4 gpu_type: nvidia-a100-80g @@ -26,7 +26,7 @@ services: envs: - name: HF_TOKEN schema: - name: mistral + name: mistral-large type: service routes: - name: chat diff --git a/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/docker/Dockerfile b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/docker/entrypoint.sh b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/env/docker/entrypoint.sh rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/docker/entrypoint.sh diff --git a/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/install.sh b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/requirements.lock.txt b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/requirements.txt b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/version.txt b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/version.txt similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/version.txt rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/env/python/version.txt diff --git a/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/bentofile.yaml b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/bentofile.yaml new file mode 100644 index 00000000..02125592 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: mistralai/Mistral-Large-Instruct-2407 + openllm_alias: 123b, 123b-instruct-2407 + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/bento_constants.py b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/openllm_config.yaml similarity index 77% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/bento_constants.py rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/openllm_config.yaml index 16593514..08d6bcb1 100644 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/bento_constants.py +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: mistral-instruct engine_config: dtype: half @@ -8,14 +6,12 @@ tensor_parallel_size: 4 extra_labels: model_name: mistralai/Mistral-Large-Instruct-2407 - openllm_alias: large,large-instruct, 123b, 123b-instruct + openllm_alias: 123b, 123b-instruct-2407 project: vllm-chat service_config: - name: mistral + name: mistral-large resources: gpu: 4 gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/service.py b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/404.html b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/404.html similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/404.html rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/404.html diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/apple-touch-icon.png b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/apple-touch-icon.png rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/chat.html b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/chat.html similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/chat.html rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/chat.html diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/chat.txt b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/chat.txt rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/chat.txt diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/favicon-16x16.png b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/favicon-16x16.png rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/favicon.ico b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/favicon.ico rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/favicon.ico diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/index.html b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/index.html similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/index.html rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/index.html diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/index.txt b/bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/index.txt similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/ui/index.txt rename to bentoml/bentos/mistral-large/123b-instruct-fp16-a203/src/ui/index.txt diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/README.md b/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/README.md deleted file mode 100644 index 17befb2a..00000000 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# mistral:7b-instruct-awq-4bit-332d - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/docker/Dockerfile b/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/install.sh b/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/requirements.lock.txt b/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/requirements.txt b/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/service.py b/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/README.md b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/README.md new file mode 100644 index 00000000..30a356b2 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/README.md @@ -0,0 +1,16 @@ +# mistral:7b-instruct-awq-4bit-4406 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/apis/openapi.yaml b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/apis/openapi.yaml rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/apis/openapi.yaml index c8cb72e6..80008ce9 100644 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/apis/openapi.yaml +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# mistral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# mistral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/apis/schema.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/apis/schema.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/apis/schema.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/apis/schema.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/bento.yaml b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/bento.yaml similarity index 96% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/bento.yaml rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/bento.yaml index ea480278..85b0a02c 100644 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/bento.yaml +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: mistral -version: 7b-instruct-awq-4bit-332d -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:18:09.582213+00:00' +version: 7b-instruct-awq-4bit-4406 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:40:44.074293+00:00' labels: model_name: TheBloke/Mistral-7B-Instruct-v0.1-AWQ openllm_alias: 7b-4bit,7b-instruct-4bit diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/docker/Dockerfile b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/docker/entrypoint.sh b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/docker/entrypoint.sh rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/docker/entrypoint.sh diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/install.sh b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/requirements.lock.txt b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/requirements.txt b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/version.txt b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/version.txt similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/version.txt rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/env/python/version.txt diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/bentofile.yaml b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/bentofile.yaml rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/bentofile.yaml index a7295060..c2b84d0b 100644 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/bentofile.yaml +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 7b-4bit,7b-instruct-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/bento_constants.py b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/openllm_config.yaml similarity index 94% rename from bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/bento_constants.py rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/openllm_config.yaml index 134f905c..8e8560b4 100644 --- a/bentoml/bentos/mistral/7b-instruct-awq-4bit-332d/src/bento_constants.py +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: mistral-instruct engine_config: dtype: half @@ -18,5 +16,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/service.py b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/404.html b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/404.html similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/404.html rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/404.html diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/apple-touch-icon.png b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/apple-touch-icon.png rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/chat.html b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/chat.html similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/chat.html rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/chat.html diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/chat.txt b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/chat.txt rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/chat.txt diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/favicon-16x16.png b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/favicon-16x16.png rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/favicon.ico b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/favicon.ico rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/favicon.ico diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/index.html b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/index.html similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/index.html rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/index.html diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/index.txt b/bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/index.txt similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/ui/index.txt rename to bentoml/bentos/mistral/7b-instruct-awq-4bit-4406/src/ui/index.txt diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/README.md b/bentoml/bentos/mistral/7b-instruct-fp16-c489/README.md deleted file mode 100644 index bb0e8c2f..00000000 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# mistral:7b-instruct-fp16-c489 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/docker/Dockerfile b/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/install.sh b/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/requirements.lock.txt b/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/requirements.txt b/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/service.py b/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/README.md b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/README.md new file mode 100644 index 00000000..addf42a4 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/README.md @@ -0,0 +1,16 @@ +# mistral:7b-instruct-fp16-e3bd + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/apis/openapi.yaml b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/apis/openapi.yaml rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/apis/openapi.yaml index 25f82727..4f1c59c9 100644 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/apis/openapi.yaml +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# mistral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# mistral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/apis/schema.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/apis/schema.json similarity index 100% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/apis/schema.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/apis/schema.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/bento.yaml b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/bento.yaml similarity index 96% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/bento.yaml rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/bento.yaml index 0fb5dfe7..09944b01 100644 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/bento.yaml +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: mistral -version: 7b-instruct-fp16-c489 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:18:16.681630+00:00' +version: 7b-instruct-fp16-e3bd +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:40:51.450444+00:00' labels: model_name: mistralai/Mistral-7B-Instruct-v0.1 openllm_alias: 7b,7b-instruct diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/docker/Dockerfile b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/docker/entrypoint.sh b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/docker/entrypoint.sh rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/docker/entrypoint.sh diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/install.sh b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/requirements.lock.txt b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/requirements.txt b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/version.txt b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/version.txt similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/version.txt rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/env/python/version.txt diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/bentofile.yaml b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/bentofile.yaml rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/bentofile.yaml index ea7d2740..eb86d365 100644 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/bentofile.yaml +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 7b,7b-instruct platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/bento_constants.py b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/mistral/7b-instruct-fp16-c489/src/bento_constants.py rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/openllm_config.yaml index 08f18c62..9e8765eb 100644 --- a/bentoml/bentos/mistral/7b-instruct-fp16-c489/src/bento_constants.py +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: mistral-instruct engine_config: dtype: half @@ -17,5 +15,3 @@ gpu_type: nvidia-tesla-l4 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/service.py b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/404.html b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/404.html similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/404.html rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/404.html diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/apple-touch-icon.png b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/apple-touch-icon.png rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/chat.html b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/chat.html similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/chat.html rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/chat.html diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/chat.txt b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/chat.txt rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/chat.txt diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/favicon-16x16.png b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/favicon-16x16.png rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/favicon.ico b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/favicon.ico rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/favicon.ico diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/index.html b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/index.html similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/index.html rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/index.html diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/index.txt b/bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/index.txt similarity index 100% rename from bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/ui/index.txt rename to bentoml/bentos/mistral/7b-instruct-fp16-e3bd/src/ui/index.txt diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/README.md b/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/README.md deleted file mode 100644 index 3592c06d..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# mistral:large-123b-instruct-awq-4bit-ec0c - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/docker/Dockerfile b/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/install.sh b/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/requirements.txt b/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/bentofile.yaml b/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/bentofile.yaml deleted file mode 100644 index 37115eef..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: casperhansen/mistral-large-instruct-2407-awq - openllm_alias: large-4bit,large-instruct-4bit, 123b-4bit, 123b-instruct-4bit - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/service.py b/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-awq-4bit-ec0c/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/README.md b/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/README.md deleted file mode 100644 index f544cd48..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# mistral:large-123b-instruct-fp16-cadc - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/docker/Dockerfile b/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/install.sh b/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/requirements.txt b/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/bentofile.yaml b/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/bentofile.yaml deleted file mode 100644 index 7ae74914..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: mistralai/Mistral-Large-Instruct-2407 - openllm_alias: large,large-instruct, 123b, 123b-instruct - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/service.py b/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/mistral/large-123b-instruct-fp16-cadc/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/README.md b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/README.md deleted file mode 100644 index ee2638b5..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# mixtral:8x7b-instruct-v0.1-awq-4bit-7682 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/docker/Dockerfile b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/install.sh b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/requirements.lock.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/requirements.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/bentofile.yaml b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/bentofile.yaml deleted file mode 100644 index 4f0beb9c..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: casperhansen/mixtral-instruct-awq - openllm_alias: 8x7b-4bit - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/service.py b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/README.md b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/README.md new file mode 100644 index 00000000..c184843c --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/README.md @@ -0,0 +1,16 @@ +# mixtral:8x7b-instruct-v0.1-awq-4bit-7bae + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/apis/openapi.yaml b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/apis/openapi.yaml rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/apis/openapi.yaml index 826a13d9..98a3f323 100644 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/apis/openapi.yaml +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# mixtral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# mixtral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/apis/schema.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/apis/schema.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/apis/schema.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/apis/schema.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/bento.yaml b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/bento.yaml similarity index 96% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/bento.yaml rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/bento.yaml index 7ca8ee04..c0e2845b 100644 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/bento.yaml +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: mixtral -version: 8x7b-instruct-v0.1-awq-4bit-7682 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:19:57.365133+00:00' +version: 8x7b-instruct-v0.1-awq-4bit-7bae +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:43:30.972889+00:00' labels: model_name: casperhansen/mixtral-instruct-awq openllm_alias: 8x7b-4bit diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/docker/Dockerfile b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/docker/entrypoint.sh b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/docker/entrypoint.sh rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/docker/entrypoint.sh diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/install.sh b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/requirements.lock.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/requirements.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/version.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/version.txt similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/env/python/version.txt rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/env/python/version.txt diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/bentofile.yaml b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/bentofile.yaml new file mode 100644 index 00000000..50d2017c --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: casperhansen/mixtral-instruct-awq + openllm_alias: 8x7b-4bit + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/bento_constants.py b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/bento_constants.py rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/openllm_config.yaml index eb42ae8c..8d08c752 100644 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/bento_constants.py +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: mistral-instruct engine_config: gpu_memory_utilization: 0.8 @@ -17,5 +15,3 @@ gpu_type: nvidia-tesla-a100 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/service.py b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/404.html b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/404.html similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/404.html rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/404.html diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/apple-touch-icon.png b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/apple-touch-icon.png rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/chat.html b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/chat.html similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/chat.html rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/chat.html diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/chat.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/chat.txt rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/chat.txt diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/favicon-16x16.png b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/favicon-16x16.png rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/favicon.ico b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/favicon.ico rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/favicon.ico diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/index.html b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/index.html similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/index.html rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/index.html diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/index.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/index.txt similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7682/src/ui/index.txt rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-awq-4bit-7bae/src/ui/index.txt diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/README.md b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/README.md new file mode 100644 index 00000000..e1b35cc3 --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/README.md @@ -0,0 +1,16 @@ +# mixtral:8x7b-instruct-v0.1-fp16-1c82 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/apis/openapi.yaml b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/apis/openapi.yaml rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/apis/openapi.yaml index d6b704f2..f319ae44 100644 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/apis/openapi.yaml +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# mixtral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# mixtral:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/apis/schema.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/apis/schema.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/apis/schema.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/apis/schema.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/bento.yaml b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/bento.yaml similarity index 96% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/bento.yaml rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/bento.yaml index cf199f3a..0f4603e6 100644 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/bento.yaml +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: mixtral -version: 8x7b-instruct-v0.1-fp16-572d -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:19:50.280555+00:00' +version: 8x7b-instruct-v0.1-fp16-1c82 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:43:23.449505+00:00' labels: model_name: mistralai/Mixtral-8x7B-Instruct-v0.1 openllm_alias: 8x7b,8x7b-instruct diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/docker/Dockerfile b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/docker/entrypoint.sh b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/docker/entrypoint.sh rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/docker/entrypoint.sh diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/install.sh b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/requirements.lock.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/requirements.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/version.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/version.txt similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/version.txt rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/env/python/version.txt diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/bentofile.yaml b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/bentofile.yaml rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/bentofile.yaml index a6c53ce6..effc9f2d 100644 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/bentofile.yaml +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 8x7b,8x7b-instruct platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/bento_constants.py b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/bento_constants.py rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/openllm_config.yaml index eda643de..cb2aae58 100644 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/bento_constants.py +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: mistral-instruct engine_config: max_model_len: 2048 @@ -16,5 +14,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/service.py b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/404.html b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/404.html similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/404.html rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/404.html diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/apple-touch-icon.png b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/apple-touch-icon.png rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/chat.html b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/chat.html similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/chat.html rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/chat.html diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/chat.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/chat.txt rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/chat.txt diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/favicon-16x16.png b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/favicon-16x16.png rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/favicon.ico b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/favicon.ico rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/favicon.ico diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/index.html b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/index.html similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/index.html rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/index.html diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/index.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/index.txt similarity index 100% rename from bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/ui/index.txt rename to bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-1c82/src/ui/index.txt diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/README.md b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/README.md deleted file mode 100644 index 9191b986..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# mixtral:8x7b-instruct-v0.1-fp16-572d - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/docker/Dockerfile b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/install.sh b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/requirements.lock.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/requirements.txt b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/service.py b/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/mixtral/8x7b-instruct-v0.1-fp16-572d/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/README.md b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/README.md new file mode 100644 index 00000000..3a96a4bc --- /dev/null +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/README.md @@ -0,0 +1,16 @@ +# phi3:3.8b-instruct-fp16-37b9 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/apis/openapi.yaml b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/apis/openapi.yaml rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/apis/openapi.yaml index 9506df99..8f0ffd7f 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/apis/openapi.yaml +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# phi3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# phi3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/apis/schema.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/apis/schema.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/apis/schema.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/apis/schema.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/bento.yaml b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/bento.yaml similarity index 96% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/bento.yaml rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/bento.yaml index 8dbb8b54..5478c9eb 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/bento.yaml +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: phi3 -version: 3.8b-instruct-fp16-c4d8 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:17:33.918787+00:00' +version: 3.8b-instruct-fp16-37b9 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:40:06.573185+00:00' labels: model_name: microsoft/Phi-3-mini-4k-instruct openllm_alias: 3.8b,3.8b-mini,3.8b-mini-instruct-4k-fp16 diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/docker/Dockerfile b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/docker/entrypoint.sh b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/docker/entrypoint.sh rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/docker/entrypoint.sh diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/install.sh b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/requirements.lock.txt b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/requirements.txt b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/version.txt b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/version.txt similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/version.txt rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/env/python/version.txt diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/bentofile.yaml b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/bentofile.yaml similarity index 51% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/bentofile.yaml rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/bentofile.yaml index d1ad606e..b270cd98 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/bentofile.yaml +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 3.8b,3.8b-mini,3.8b-mini-instruct-4k-fp16 platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/bento_constants.py b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/bento_constants.py rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/openllm_config.yaml index 4a5496ee..275eccb8 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/bento_constants.py +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: phi-3 engine_config: dtype: half @@ -16,5 +14,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/service.py b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/404.html b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/404.html similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/404.html rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/404.html diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/apple-touch-icon.png b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/apple-touch-icon.png rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/chat.html b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/chat.html similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/chat.html rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/chat.html diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/chat.txt b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/chat.txt rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/chat.txt diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/favicon-16x16.png b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/favicon-16x16.png rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/favicon.ico b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/favicon.ico rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/favicon.ico diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/index.html b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/index.html similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/index.html rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/index.html diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/index.txt b/bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/index.txt similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/ui/index.txt rename to bentoml/bentos/phi3/3.8b-instruct-fp16-37b9/src/ui/index.txt diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/README.md b/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/README.md deleted file mode 100644 index 42e5cb9b..00000000 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# phi3:3.8b-instruct-fp16-c4d8 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/docker/Dockerfile b/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/install.sh b/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/requirements.lock.txt b/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/requirements.txt b/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/service.py b/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/phi3/3.8b-instruct-fp16-c4d8/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/README.md b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/README.md new file mode 100644 index 00000000..3c87b558 --- /dev/null +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/README.md @@ -0,0 +1,16 @@ +# phi3:3.8b-instruct-ggml-q4-cf55 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/apis/openapi.yaml b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/apis/openapi.yaml similarity index 99% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/apis/openapi.yaml rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/apis/openapi.yaml index 44292c86..3d42f87f 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/apis/openapi.yaml +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/apis/openapi.yaml @@ -172,7 +172,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# phi3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# phi3:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/apis/schema.json b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/apis/schema.json similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/apis/schema.json rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/apis/schema.json diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/bento.yaml b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/bento.yaml similarity index 96% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/bento.yaml rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/bento.yaml index c71bcac1..0092ad61 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/bento.yaml +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/bento.yaml @@ -1,8 +1,8 @@ service: service:LlamaCppChat name: phi3 -version: 3.8b-instruct-ggml-q4-f5db -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:20:50.268237+00:00' +version: 3.8b-instruct-ggml-q4-cf55 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:45:13.051548+00:00' labels: model_name: microsoft/Phi-3-mini-4k-instruct-gguf openllm_alias: 3.8b-ggml-q4,3.8b-mini-instruct-4k-ggml-q4 diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/docker/Dockerfile b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/docker/Dockerfile similarity index 99% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/docker/Dockerfile rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/docker/Dockerfile index 8b20c02f..befd9b90 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/docker/Dockerfile +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/docker/Dockerfile @@ -33,7 +33,6 @@ RUN curl -LO https://astral.sh/uv/install.sh && \ ARG BENTO_USER=bentoml ARG BENTO_USER_UID=1034 ARG BENTO_USER_GID=1034 - RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER ARG CMAKE_ARGS=-DLLAMA_METAL=on ENV CMAKE_ARGS=$CMAKE_ARGS diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/docker/entrypoint.sh b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/docker/entrypoint.sh rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/docker/entrypoint.sh diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/install.sh b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/requirements.lock.txt b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/requirements.lock.txt similarity index 80% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/requirements.lock.txt rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/requirements.lock.txt index a071d3e8..e88e5bd7 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/requirements.lock.txt +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/requirements.lock.txt @@ -1,4 +1,5 @@ -aiohttp==3.9.5 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 aiosqlite==0.20.0 annotated-types==0.7.0 @@ -6,8 +7,8 @@ anyio==4.4.0 appdirs==1.4.4 asgiref==3.8.1 async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 +attrs==24.2.0 +bentoml==1.3.1 cattrs==23.1.2 certifi==2024.7.4 charset-normalizer==3.3.2 @@ -18,21 +19,17 @@ cloudpickle==3.0.0 deepmerge==1.1.1 deprecated==1.2.14 diskcache==5.6.3 -dnspython==2.6.1 -email-validator==2.2.0 exceptiongroup==1.2.2 -fastapi==0.111.1 -fastapi-cli==0.0.4 +fastapi==0.112.0 filelock==3.15.4 frozenlist==1.4.1 fs==2.4.16 fsspec==2024.6.1 h11==0.14.0 httpcore==1.0.5 -httptools==0.6.1 httpx==0.27.0 httpx-ws==0.6.0 -huggingface-hub==0.24.1 +huggingface-hub==0.24.5 idna==3.7 importlib-metadata==6.11.0 inflection==0.5.1 @@ -64,16 +61,14 @@ pydantic-core==2.20.1 pygments==2.18.0 pyparsing==3.1.2 python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 python-json-logger==2.0.7 python-multipart==0.0.9 -pyyaml==6.0.1 -pyzmq==26.0.3 +pyyaml==6.0.2 +pyzmq==26.1.0 requests==2.32.3 rich==13.7.1 schema==0.7.7 -setuptools==71.1.0 -shellingham==1.5.4 +setuptools==72.1.0 simple-di==0.1.5 six==1.16.0 sniffio==1.3.1 @@ -81,17 +76,14 @@ starlette==0.37.2 tomli==2.0.1 tomli-w==1.0.0 tornado==6.4.1 -tqdm==4.66.4 -typer==0.12.3 +tqdm==4.66.5 typing-extensions==4.12.2 urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -watchfiles==0.22.0 +uv==0.2.35 +uvicorn==0.30.5 +watchfiles==0.23.0 wcwidth==0.2.13 -websockets==12.0 wrapt==1.16.0 wsproto==1.2.0 yarl==1.9.4 -zipp==3.19.2 +zipp==3.20.0 diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/requirements.txt b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/requirements.txt similarity index 76% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/requirements.txt rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/requirements.txt index 44b51631..45bae22c 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/requirements.txt +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/requirements.txt @@ -1,4 +1,4 @@ -bentoml==1.3.0 +bentoml==1.3.1 huggingface-hub llama_cpp_python==0.2.79 fastapi diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/version.txt b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/version.txt similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/version.txt rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/env/python/version.txt diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/bentofile.yaml b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/bentofile.yaml new file mode 100644 index 00000000..839b3aeb --- /dev/null +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/bentofile.yaml @@ -0,0 +1,16 @@ +envs: +- name: CMAKE_ARGS + value: -DLLAMA_METAL=on +include: +- '*.py' +- '*.yaml' +- ui/* +labels: + model_name: microsoft/Phi-3-mini-4k-instruct-gguf + openllm_alias: 3.8b-ggml-q4,3.8b-mini-instruct-4k-ggml-q4 + platforms: macos,linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/llamacpp-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:LlamaCppChat diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/bento_constants.py b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/bento_constants.py rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/openllm_config.yaml index 6243021a..0d83c324 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/bento_constants.py +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' chat_template: phi-3 engine_config: max_model_len: 2048 @@ -14,5 +12,3 @@ memory: 3Gi traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/service.py b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/service.py similarity index 95% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/service.py rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/service.py index 7a9b5544..4ec07067 100644 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/service.py +++ b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/service.py @@ -3,7 +3,6 @@ from typing_extensions import Annotated from llama_cpp import Llama from typing import AsyncGenerator, Optional -from bento_constants import CONSTANT_YAML import yaml import fastapi import fastapi.staticfiles @@ -20,7 +19,10 @@ Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.If you don't know the answer to a question, please don't share false information """ -CONSTANTS = yaml.safe_load(CONSTANT_YAML) +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) ENGINE_CONFIG = CONSTANTS["engine_config"] SERVICE_CONFIG = CONSTANTS["service_config"] diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/404.html b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/404.html similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/404.html rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/404.html diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/apple-touch-icon.png b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/apple-touch-icon.png rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/chat.html b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/chat.html similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/chat.html rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/chat.html diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/chat.txt b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/chat.txt rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/chat.txt diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/favicon-16x16.png b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/favicon-16x16.png rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/favicon.ico b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/favicon.ico rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/favicon.ico diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/index.html b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/index.html similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/index.html rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/index.html diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/index.txt b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/index.txt similarity index 100% rename from bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/ui/index.txt rename to bentoml/bentos/phi3/3.8b-instruct-ggml-q4-cf55/src/ui/index.txt diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/README.md b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/README.md deleted file mode 100644 index 1ea73862..00000000 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# phi3:3.8b-instruct-ggml-q4-f5db - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/install.sh b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/bentofile.yaml b/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/bentofile.yaml deleted file mode 100644 index 8ff9ccf0..00000000 --- a/bentoml/bentos/phi3/3.8b-instruct-ggml-q4-f5db/src/bentofile.yaml +++ /dev/null @@ -1,43 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - CMAKE_ARGS: -DLLAMA_METAL=on - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: CMAKE_ARGS - value: -DLLAMA_METAL=on -exclude: [] -include: -- '*.py' -- ui/* -labels: - model_name: microsoft/Phi-3-mini-4k-instruct-gguf - openllm_alias: 3.8b-ggml-q4,3.8b-mini-instruct-4k-ggml-q4 - platforms: macos,linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/llamacpp-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:LlamaCppChat diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/README.md b/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/README.md deleted file mode 100644 index a7a4b949..00000000 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# qwen2:0.5b-instruct-fp16-0bca - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/docker/Dockerfile b/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/install.sh b/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/requirements.txt b/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/bentofile.yaml b/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/bentofile.yaml deleted file mode 100644 index 67df231f..00000000 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: Qwen/Qwen2-0.5B-Instruct - openllm_alias: 0.5b,0.5b-instruct - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/service.py b/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/README.md b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/README.md new file mode 100644 index 00000000..b908091b --- /dev/null +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/README.md @@ -0,0 +1,16 @@ +# qwen2:0.5b-instruct-fp16-bca0 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/apis/openapi.yaml b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/apis/openapi.yaml rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/apis/openapi.yaml index b1ec21b1..b4e145e6 100644 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/apis/openapi.yaml +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/apis/schema.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/apis/schema.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/apis/schema.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/apis/schema.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/bento.yaml b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/bento.yaml similarity index 96% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/bento.yaml rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/bento.yaml index 2482dcf9..56615ae5 100644 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/bento.yaml +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: qwen2 -version: 0.5b-instruct-fp16-0bca -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:20:04.663421+00:00' +version: 0.5b-instruct-fp16-bca0 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:43:38.459991+00:00' labels: model_name: Qwen/Qwen2-0.5B-Instruct openllm_alias: 0.5b,0.5b-instruct diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/docker/Dockerfile b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/docker/entrypoint.sh b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/docker/entrypoint.sh rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/docker/entrypoint.sh diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/install.sh b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/requirements.txt b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/version.txt b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/version.txt similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/env/python/version.txt rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/env/python/version.txt diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/bentofile.yaml b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/bentofile.yaml new file mode 100644 index 00000000..e3fac140 --- /dev/null +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: Qwen/Qwen2-0.5B-Instruct + openllm_alias: 0.5b,0.5b-instruct + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/bento_constants.py b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/bento_constants.py rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/openllm_config.yaml index 95732c26..5d968547 100644 --- a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/bento_constants.py +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -15,5 +13,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/service.py b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/404.html b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/404.html similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/404.html rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/404.html diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/apple-touch-icon.png b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/apple-touch-icon.png rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/chat.html b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/chat.html similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/chat.html rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/chat.html diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/chat.txt b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/chat.txt rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/chat.txt diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/favicon-16x16.png b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/favicon-16x16.png rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/favicon.ico b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/favicon.ico rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/favicon.ico diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/index.html b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/index.html similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/index.html rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/index.html diff --git a/bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/index.txt b/bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/index.txt similarity index 100% rename from bentoml/bentos/qwen2/0.5b-instruct-fp16-0bca/src/ui/index.txt rename to bentoml/bentos/qwen2/0.5b-instruct-fp16-bca0/src/ui/index.txt diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/README.md b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/README.md new file mode 100644 index 00000000..097fb3a5 --- /dev/null +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/README.md @@ -0,0 +1,16 @@ +# qwen2:1.5b-instruct-fp16-df66 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/apis/openapi.yaml b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/apis/openapi.yaml rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/apis/openapi.yaml index e2d5cae5..7ca7160a 100644 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/apis/openapi.yaml +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/apis/schema.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/apis/schema.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/apis/schema.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/apis/schema.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/bento.yaml b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/bento.yaml similarity index 96% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/bento.yaml rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/bento.yaml index b6d6bc0f..e821014b 100644 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/bento.yaml +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: qwen2 -version: 1.5b-instruct-fp16-f784 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:20:11.672283+00:00' +version: 1.5b-instruct-fp16-df66 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:43:45.828141+00:00' labels: model_name: Qwen/Qwen2-1.5B-Instruct openllm_alias: 1.5b,1.5b-instruct diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/docker/Dockerfile b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/docker/entrypoint.sh b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/docker/entrypoint.sh rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/docker/entrypoint.sh diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/install.sh b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/requirements.txt b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/version.txt b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/version.txt similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/version.txt rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/env/python/version.txt diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/bentofile.yaml b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/bentofile.yaml new file mode 100644 index 00000000..c3d4cc9d --- /dev/null +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: Qwen/Qwen2-1.5B-Instruct + openllm_alias: 1.5b,1.5b-instruct + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/bento_constants.py b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/bento_constants.py rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/openllm_config.yaml index b171f52f..c636dfee 100644 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/bento_constants.py +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -15,5 +13,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/service.py b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/404.html b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/404.html similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/404.html rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/404.html diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/apple-touch-icon.png b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/apple-touch-icon.png rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/chat.html b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/chat.html similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/chat.html rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/chat.html diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/chat.txt b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/chat.txt rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/chat.txt diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/favicon-16x16.png b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/favicon-16x16.png rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/favicon.ico b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/favicon.ico rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/favicon.ico diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/index.html b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/index.html similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/index.html rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/index.html diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/index.txt b/bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/index.txt similarity index 100% rename from bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/ui/index.txt rename to bentoml/bentos/qwen2/1.5b-instruct-fp16-df66/src/ui/index.txt diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/README.md b/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/README.md deleted file mode 100644 index eba5e021..00000000 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# qwen2:1.5b-instruct-fp16-f784 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/docker/Dockerfile b/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/install.sh b/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/requirements.txt b/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/bentofile.yaml b/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/bentofile.yaml deleted file mode 100644 index 22daccad..00000000 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: Qwen/Qwen2-1.5B-Instruct - openllm_alias: 1.5b,1.5b-instruct - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/service.py b/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/qwen2/1.5b-instruct-fp16-f784/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/README.md b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/README.md deleted file mode 100644 index 0a760f52..00000000 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# qwen2:57b-a14b-instruct-fp16-4dcd - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/docker/Dockerfile b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/install.sh b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/requirements.txt b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/service.py b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/README.md b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/README.md new file mode 100644 index 00000000..bc74ac8b --- /dev/null +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/README.md @@ -0,0 +1,16 @@ +# qwen2:57b-a14b-instruct-fp16-b847 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/apis/openapi.yaml b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/apis/openapi.yaml rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/apis/openapi.yaml index e017fdb3..32669d58 100644 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/apis/openapi.yaml +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/apis/schema.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/apis/schema.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/apis/schema.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/apis/schema.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/bento.yaml b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/bento.yaml similarity index 96% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/bento.yaml rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/bento.yaml index 928e85a6..1e37b3d4 100644 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/bento.yaml +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: qwen2 -version: 57b-a14b-instruct-fp16-4dcd -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:20:40.500555+00:00' +version: 57b-a14b-instruct-fp16-b847 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:44:15.113026+00:00' labels: model_name: Qwen/Qwen2-57B-A14B-Instruct openllm_alias: 57b-a14b,57b-a14b-instruct diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/docker/Dockerfile b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/docker/entrypoint.sh b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/docker/entrypoint.sh rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/docker/entrypoint.sh diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/install.sh b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/requirements.txt b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/version.txt b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/version.txt similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/env/python/version.txt rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/env/python/version.txt diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/bentofile.yaml b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/bentofile.yaml rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/bentofile.yaml index 787bb3e2..143d7d5c 100644 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/bentofile.yaml +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 57b-a14b,57b-a14b-instruct platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/bento_constants.py b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/openllm_config.yaml similarity index 93% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/bento_constants.py rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/openllm_config.yaml index b71e2b92..7c3436f2 100644 --- a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/bento_constants.py +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -16,5 +14,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/service.py b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/404.html b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/404.html similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/404.html rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/404.html diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/apple-touch-icon.png b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/apple-touch-icon.png rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/chat.html b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/chat.html similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/chat.html rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/chat.html diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/chat.txt b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/chat.txt rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/chat.txt diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/favicon-16x16.png b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/favicon-16x16.png rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/favicon.ico b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/favicon.ico rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/favicon.ico diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/index.html b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/index.html similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/index.html rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/index.html diff --git a/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/index.txt b/bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/index.txt similarity index 100% rename from bentoml/bentos/qwen2/57b-a14b-instruct-fp16-4dcd/src/ui/index.txt rename to bentoml/bentos/qwen2/57b-a14b-instruct-fp16-b847/src/ui/index.txt diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/README.md b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/README.md deleted file mode 100644 index d30d1135..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# qwen2:72b-instruct-awq-4bit-13bf - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/docker/Dockerfile b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/install.sh b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/requirements.txt b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/service.py b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/README.md b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/README.md new file mode 100644 index 00000000..2223dc90 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/README.md @@ -0,0 +1,16 @@ +# qwen2:72b-instruct-awq-4bit-60b1 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/apis/openapi.yaml b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/apis/openapi.yaml rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/apis/openapi.yaml index d657616d..150073a4 100644 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/apis/openapi.yaml +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/apis/schema.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/apis/schema.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/apis/schema.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/apis/schema.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/bento.yaml b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/bento.yaml similarity index 96% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/bento.yaml rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/bento.yaml index 1fc8efbd..ac5c7ca7 100644 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/bento.yaml +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: qwen2 -version: 72b-instruct-awq-4bit-13bf -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:20:33.368352+00:00' +version: 72b-instruct-awq-4bit-60b1 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:44:07.732866+00:00' labels: model_name: Qwen/Qwen2-72B-Instruct-AWQ openllm_alias: 72b-4bit,72b-instruct-4bit diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/docker/Dockerfile b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/docker/entrypoint.sh b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/docker/entrypoint.sh rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/docker/entrypoint.sh diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/install.sh b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/requirements.txt b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/version.txt b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/version.txt similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/env/python/version.txt rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/env/python/version.txt diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/bentofile.yaml b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/bentofile.yaml rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/bentofile.yaml index 5d151dff..836d0656 100644 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/bentofile.yaml +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 72b-4bit,72b-instruct-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/bento_constants.py b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/bento_constants.py rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/openllm_config.yaml index b5f6e008..1596e8f2 100644 --- a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/bento_constants.py +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: max_model_len: 2048 model: Qwen/Qwen2-72B-Instruct-AWQ @@ -15,5 +13,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/service.py b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/404.html b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/404.html similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/404.html rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/404.html diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/apple-touch-icon.png b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/apple-touch-icon.png rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/chat.html b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/chat.html similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/chat.html rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/chat.html diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/chat.txt b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/chat.txt rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/chat.txt diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/favicon-16x16.png b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/favicon-16x16.png rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/favicon.ico b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/favicon.ico rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/favicon.ico diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/index.html b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/index.html similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/index.html rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/index.html diff --git a/bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/index.txt b/bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/index.txt similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-awq-4bit-13bf/src/ui/index.txt rename to bentoml/bentos/qwen2/72b-instruct-awq-4bit-60b1/src/ui/index.txt diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/README.md b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/README.md new file mode 100644 index 00000000..98f86c41 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/README.md @@ -0,0 +1,16 @@ +# qwen2:72b-instruct-fp16-ee8e + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/apis/openapi.yaml b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/apis/openapi.yaml rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/apis/openapi.yaml index e225a4e5..1bf514d3 100644 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/apis/openapi.yaml +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/apis/schema.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/apis/schema.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/apis/schema.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/apis/schema.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/bento.yaml b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/bento.yaml similarity index 96% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/bento.yaml rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/bento.yaml index 2c10252b..dd8bc6f7 100644 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/bento.yaml +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: qwen2 -version: 72b-instruct-fp16-f73f -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:20:47.668161+00:00' +version: 72b-instruct-fp16-ee8e +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:44:22.528631+00:00' labels: model_name: Qwen/Qwen2-72B-Instruct openllm_alias: 72b,72b-instruct diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/docker/Dockerfile b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/docker/entrypoint.sh b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/docker/entrypoint.sh rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/docker/entrypoint.sh diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/install.sh b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/requirements.txt b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/version.txt b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/version.txt similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/version.txt rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/env/python/version.txt diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/bentofile.yaml b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/bentofile.yaml new file mode 100644 index 00000000..98ba9605 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: Qwen/Qwen2-72B-Instruct + openllm_alias: 72b,72b-instruct + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/bento_constants.py b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/bento_constants.py rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/openllm_config.yaml index 1eb4d6a0..b1c8714b 100644 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/bento_constants.py +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -16,5 +14,3 @@ gpu_type: nvidia-a100-80g traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/service.py b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/404.html b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/404.html similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/404.html rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/404.html diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/apple-touch-icon.png b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/apple-touch-icon.png rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/chat.html b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/chat.html similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/chat.html rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/chat.html diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/chat.txt b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/chat.txt rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/chat.txt diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/favicon-16x16.png b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/favicon-16x16.png rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/favicon.ico b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/favicon.ico rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/favicon.ico diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/index.html b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/index.html similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/index.html rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/index.html diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/index.txt b/bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/index.txt similarity index 100% rename from bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/ui/index.txt rename to bentoml/bentos/qwen2/72b-instruct-fp16-ee8e/src/ui/index.txt diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/README.md b/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/README.md deleted file mode 100644 index d44a8b7e..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# qwen2:72b-instruct-fp16-f73f - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/docker/Dockerfile b/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/install.sh b/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/requirements.txt b/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/bentofile.yaml b/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/bentofile.yaml deleted file mode 100644 index 79afbd9e..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: Qwen/Qwen2-72B-Instruct - openllm_alias: 72b,72b-instruct - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/service.py b/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/qwen2/72b-instruct-fp16-f73f/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/README.md b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/README.md new file mode 100644 index 00000000..4a471fd8 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/README.md @@ -0,0 +1,16 @@ +# qwen2:7b-instruct-awq-4bit-02f4 + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/apis/openapi.yaml b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/apis/openapi.yaml rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/apis/openapi.yaml index 1734fba2..92fd240a 100644 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/apis/openapi.yaml +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/apis/schema.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/apis/schema.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/apis/schema.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/apis/schema.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/bento.yaml b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/bento.yaml similarity index 96% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/bento.yaml rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/bento.yaml index c6264051..ee191857 100644 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/bento.yaml +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: qwen2 -version: 7b-instruct-awq-4bit-3150 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:20:18.931386+00:00' +version: 7b-instruct-awq-4bit-02f4 +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:43:53.062849+00:00' labels: model_name: Qwen/Qwen2-7B-Instruct-AWQ openllm_alias: 7b-4bit,7b-instruct-4bit diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/docker/Dockerfile b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/docker/entrypoint.sh b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/docker/entrypoint.sh rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/docker/entrypoint.sh diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/install.sh b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/requirements.txt b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/version.txt b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/version.txt similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/version.txt rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/env/python/version.txt diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/bentofile.yaml b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/bentofile.yaml similarity index 50% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/bentofile.yaml rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/bentofile.yaml index 4d9237ed..001feb1d 100644 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/bentofile.yaml +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/bentofile.yaml @@ -1,24 +1,8 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null envs: - name: HF_TOKEN -exclude: [] include: - '*.py' +- '*.yaml' - ui/* - ui/chunks/* - ui/css/* @@ -32,18 +16,7 @@ labels: openllm_alias: 7b-4bit,7b-instruct-4bit platforms: linux source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null python: - extra_index_url: null - find_links: null - index_url: null lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null requirements_txt: ./requirements.txt - trusted_host: null - wheels: null service: service:VLLM diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/bento_constants.py b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/openllm_config.yaml similarity index 92% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/bento_constants.py rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/openllm_config.yaml index 5f9317b0..afafb7a2 100644 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/bento_constants.py +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: max_model_len: 2048 model: Qwen/Qwen2-7B-Instruct-AWQ @@ -15,5 +13,3 @@ gpu_type: nvidia-rtx-3060 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/service.py b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/404.html b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/404.html similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/404.html rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/404.html diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/apple-touch-icon.png b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/apple-touch-icon.png rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/chat.html b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/chat.html similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/chat.html rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/chat.html diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/chat.txt b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/chat.txt rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/chat.txt diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/favicon-16x16.png b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/favicon-16x16.png rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/favicon.ico b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/favicon.ico rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/favicon.ico diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/index.html b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/index.html similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/index.html rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/index.html diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/index.txt b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/index.txt similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/ui/index.txt rename to bentoml/bentos/qwen2/7b-instruct-awq-4bit-02f4/src/ui/index.txt diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/README.md b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/README.md deleted file mode 100644 index a152ac18..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# qwen2:7b-instruct-awq-4bit-3150 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/docker/Dockerfile b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/install.sh b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/requirements.txt b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/service.py b/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-awq-4bit-3150/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/README.md b/bentoml/bentos/qwen2/7b-instruct-fp16-0016/README.md deleted file mode 100644 index 556dd4e6..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# qwen2:7b-instruct-fp16-0016 - -[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML) -[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) -[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) -[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) -[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) - -This is a Machine Learning Service created with BentoML. - -## Help - -* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. -* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. -* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. -* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/docker/Dockerfile b/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/docker/Dockerfile deleted file mode 100644 index 6e67ca1c..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -# =========================================== -# -# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT -# -# =========================================== - -# Block SETUP_BENTO_BASE_IMAGE -FROM python:3.9-slim as base-container - -ENV LANG=C.UTF-8 - -ENV LC_ALL=C.UTF-8 - -ENV PYTHONIOENCODING=UTF-8 - -ENV PYTHONUNBUFFERED=1 - - - -USER root - -ENV DEBIAN_FRONTEND=noninteractive -RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -RUN set -eux && \ - apt-get update -y && \ - apt-get install -q -y --no-install-recommends --allow-remove-essential \ - ca-certificates gnupg2 bash build-essential curl -ENV UV_SYSTEM_PYTHON=1 -RUN curl -LO https://astral.sh/uv/install.sh && \ - sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ - -# Block SETUP_BENTO_USER -ARG BENTO_USER=bentoml -ARG BENTO_USER_UID=1034 -ARG BENTO_USER_GID=1034 - -RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER -ARG HF_TOKEN= -ENV HF_TOKEN=$HF_TOKEN -ARG BENTO_PATH=/home/bentoml/bento -ENV BENTO_PATH=$BENTO_PATH -ENV BENTOML_HOME=/home/bentoml/ - -RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R -WORKDIR $BENTO_PATH - - -# Block SETUP_BENTO_COMPONENTS - -RUN uv pip install torch==2.3.1 ; exit 0 -RUN uv pip install vllm==0.5.3.post1 ; exit 0 -COPY --chown=bentoml:bentoml ./env/python ./env/python/ -# install python packages with install.sh -RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh -COPY --chown=bentoml:bentoml . ./ - -# Block SETUP_BENTO_ENTRYPOINT -RUN rm -rf /var/lib/{apt,cache,log} -# Default port for BentoServer -EXPOSE 3000 - -# Expose Prometheus port -EXPOSE 3001 - -RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh - -USER bentoml - -ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] - - diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/install.sh b/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/install.sh deleted file mode 100644 index 607ee052..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -# Parent directory https://stackoverflow.com/a/246128/8643197 -BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" - -pip_install() { - if command -v "uv" > /dev/null 2>&1; then - uv pip install "$@" - else - pip3 install "$@" - fi -} - -PIP_ARGS=() - -# BentoML by default generates two requirement files: -# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` -# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file -REQUIREMENTS_TXT="$BASEDIR/requirements.txt" -REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" -WHEELS_DIR="$BASEDIR/wheels" -BENTOML_VERSION=${BENTOML_VERSION:-1.3.0} -# Install python packages, prefer installing the requirements.lock.txt file if it exist -pushd "$BASEDIR" &>/dev/null -if [ -f "$REQUIREMENTS_LOCK" ]; then - echo "Installing pip packages from 'requirements.lock.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" -else - if [ -f "$REQUIREMENTS_TXT" ]; then - echo "Installing pip packages from 'requirements.txt'.." - pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" - fi -fi -popd &>/dev/null - -# Attempt to expand the glob pattern. The nullglob option ensures that -# the pattern itself is not returned if no files match. -shopt -s nullglob -wheels=($WHEELS_DIR/*.whl) - -if [ ${#wheels[@]} -gt 0 ]; then - echo "Installing wheels packaged in Bento.." - pip_install "${PIP_ARGS[@]}" "${wheels[@]}" -fi - - -# Install the BentoML from PyPI if it's not already installed -if python3 -c "import bentoml" &> /dev/null; then - existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") - if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then - echo "WARNING: using BentoML version ${existing_bentoml_version}" - fi -else - pip_install bentoml=="$BENTOML_VERSION" -fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/requirements.lock.txt deleted file mode 100644 index 0684a350..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/requirements.lock.txt +++ /dev/null @@ -1,156 +0,0 @@ -aiohttp==3.9.5 -aiosignal==1.3.1 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.4.0 -appdirs==1.4.4 -asgiref==3.8.1 -async-timeout==4.0.3 -attrs==23.2.0 -bentoml==1.3.0 -cattrs==23.1.2 -certifi==2024.7.4 -charset-normalizer==3.3.2 -circus==0.18.0 -click==8.1.7 -click-option-group==0.5.6 -cloudpickle==3.0.0 -cmake==3.30.1 -datasets==2.14.4 -deepmerge==1.1.1 -deprecated==1.2.14 -dill==0.3.7 -diskcache==5.6.3 -distro==1.9.0 -dnspython==2.6.1 -email-validator==2.2.0 -exceptiongroup==1.2.2 -fastapi==0.111.0 -fastapi-cli==0.0.4 -filelock==3.15.4 -frozenlist==1.4.1 -fs==2.4.16 -fsspec==2024.6.1 -h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -httpx-ws==0.6.0 -huggingface-hub==0.24.1 -idna==3.7 -importlib-metadata==6.11.0 -inflection==0.5.1 -inquirerpy==0.3.4 -interegular==0.3.3 -jinja2==3.1.4 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -lark==1.1.9 -llvmlite==0.43.0 -lm-format-enforcer==0.10.3 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.15 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -numba==0.60.0 -numpy==1.26.0 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-ml-py==11.525.150 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.82 -nvidia-nvtx-cu12==12.1.105 -openai==1.37.0 -opentelemetry-api==1.20.0 -opentelemetry-instrumentation==0.41b0 -opentelemetry-instrumentation-aiohttp-client==0.41b0 -opentelemetry-instrumentation-asgi==0.41b0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -opentelemetry-util-http==0.41b0 -orjson==3.10.6 -outlines==0.0.46 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -pfzy==0.3.4 -pillow==10.4.0 -pip-requirements-parser==32.0.1 -prometheus-client==0.20.0 -prometheus-fastapi-instrumentator==7.0.0 -prompt-toolkit==3.0.47 -protobuf==5.27.2 -psutil==6.0.0 -py-cpuinfo==9.0.0 -pyairports==2.1.1 -pyarrow==17.0.0 -pycountry==24.6.1 -pydantic==2.8.2 -pydantic-core==2.20.1 -pygments==2.18.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -python-multipart==0.0.9 -pytz==2024.1 -pyyaml==6.0.1 -pyzmq==26.0.3 -ray==2.32.0 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 -safetensors==0.4.3 -schema==0.7.7 -sentencepiece==0.2.0 -setuptools==71.1.0 -shellingham==1.5.4 -simple-di==0.1.5 -six==1.16.0 -sniffio==1.3.1 -starlette==0.37.2 -sympy==1.13.1 -tiktoken==0.7.0 -tokenizers==0.19.1 -tomli==2.0.1 -tomli-w==1.0.0 -torch==2.3.1 -torchvision==0.18.1 -tornado==6.4.1 -tqdm==4.66.4 -transformers==4.43.1 -triton==2.3.1 -typer==0.12.3 -typing-extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -urllib3==2.2.2 -uv==0.2.28 -uvicorn==0.30.3 -uvloop==0.19.0 -vllm==0.5.3.post1 -vllm-flash-attn==2.5.9.post1 -watchfiles==0.22.0 -wcwidth==0.2.13 -websockets==12.0 -wrapt==1.16.0 -wsproto==1.2.0 -xformers==0.0.27 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.19.2 diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/requirements.txt b/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/requirements.txt deleted file mode 100644 index dc141d0d..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -bentoml==1.3.0 -torch==2.3.1 -vllm==0.5.3.post1 -numpy==1.26.0 -transformers==4.43.1 -fastapi==0.111.0 -pyyaml diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/bentofile.yaml b/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/bentofile.yaml deleted file mode 100644 index bc7e804d..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/bentofile.yaml +++ /dev/null @@ -1,49 +0,0 @@ -conda: - channels: null - dependencies: null - environment_yml: null - pip: null -description: null -docker: - base_image: null - cuda_version: null - distro: debian - dockerfile_template: null - env: - HF_TOKEN: '' - python_version: '3.9' - setup_script: null - system_packages: null -envs: -- name: HF_TOKEN -exclude: [] -include: -- '*.py' -- ui/* -- ui/chunks/* -- ui/css/* -- ui/media/* -- ui/chunks/pages/* -- bentovllm_openai/*.py -- chat_templates/chat_templates/*.jinja -- chat_templates/generation_configs/*.json -labels: - model_name: Qwen/Qwen2-7B-Instruct - openllm_alias: 7b,7b-instruct - platforms: linux - source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat -models: [] -name: null -python: - extra_index_url: null - find_links: null - index_url: null - lock_packages: true - no_index: null - pack_git_packages: true - packages: null - pip_args: null - requirements_txt: ./requirements.txt - trusted_host: null - wheels: null -service: service:VLLM diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/service.py b/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/service.py deleted file mode 100644 index a6f4b728..00000000 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/service.py +++ /dev/null @@ -1,244 +0,0 @@ -import functools -import json -import logging -import os -import sys -import uuid -from typing import AsyncGenerator, Literal, Optional - -import bentoml -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml -from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML -from fastapi.responses import FileResponse -from typing_extensions import Annotated, Literal - - -class Message(pydantic.BaseModel): - content: str - role: Literal["system", "user", "assistant"] - - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) - -ENGINE_CONFIG = CONSTANTS["engine_config"] -SERVICE_CONFIG = CONSTANTS["service_config"] -OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -openai_api_app = fastapi.FastAPI() -static_app = fastapi.FastAPI() -ui_app = fastapi.FastAPI() - - -OPENAI_ENDPOINTS = [ - ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], - ["/completions", vllm_api_server.create_completion, ["POST"]], - ["/models", vllm_api_server.show_available_models, ["GET"]], -] - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -for route, endpoint, methods in OPENAI_ENDPOINTS: - openai_api_app.add_api_route( - path=route, - endpoint=endpoint, - methods=methods, - include_in_schema=True, - ) - - -STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") - -ui_app.mount( - "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" -) - - -@ui_app.get("/") -async def serve_chat_html(): - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -@ui_app.get("/{full_path:path}") -async def catch_all(full_path: str): - file_path = os.path.join(STATIC_DIR, full_path) - if os.path.exists(file_path): - return FileResponse(file_path) - return FileResponse(os.path.join(STATIC_DIR, "chat.html")) - - -# special handling for prometheus_client of bentoml -if "prometheus_client" in sys.modules: - sys.modules.pop("prometheus_client") - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.mount_asgi_app(ui_app, path="/chat") -@bentoml.service(**SERVICE_CONFIG) -class VLLM: - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs, AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) - self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) - self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) - logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") - - if OVERRIDE_CHAT_TEMPLATE: # use community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - chat_template = gen_config["template"] - else: - chat_template = None - - model_config = self.engine.engine.get_model_config() - - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - response_role="assistant", - chat_template=chat_template, - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - engine=self.engine, - served_model_names=[ENGINE_CONFIG["model"]], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - - @bentoml.api(route="/api/generate") - async def generate( - self, - prompt: str = "Explain superconductors like I'm five years old", - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - if stop is None: - stop = [] - - from vllm import SamplingParams - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) - - cursor = 0 - async for request_output in stream: - text = request_output.outputs[0].text - yield text[cursor:] - cursor = len(text) - - @bentoml.api(route="/api/chat") - async def chat( - self, - messages: list[Message] = [ - Message(content="what is the meaning of life?", role="user") - ], - model: str = ENGINE_CONFIG["model"], - max_tokens: Annotated[ - int, - Ge(128), - Le(ENGINE_CONFIG["max_model_len"]), - ] = ENGINE_CONFIG["max_model_len"], - stop: Optional[list[str]] = None, - ) -> AsyncGenerator[str, None]: - """ - light-weight chat API that takes in a list of messages and returns a response - """ - from vllm import SamplingParams - - try: - if OVERRIDE_CHAT_TEMPLATE: # community chat template - gen_config = _get_gen_config(CONSTANTS["chat_template"]) - if not stop: - if gen_config["stop_str"]: - stop = [gen_config["stop_str"]] - else: - stop = [] - system_prompt = gen_config["system_prompt"] - self.tokenizer.chat_template = gen_config["template"] - else: - if not stop: - if self.tokenizer.eos_token is not None: - stop = [self.tokenizer.eos_token] - else: - stop = [] - system_prompt = None - - SAMPLING_PARAM = SamplingParams( - max_tokens=max_tokens, - stop=stop, - ) - if system_prompt and messages[0].role != "system": - messages = [dict(role="system", content=system_prompt)] + messages - - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - - stream = await self.engine.add_request( - uuid.uuid4().hex, prompt, SAMPLING_PARAM - ) - - cursor = 0 - strip_flag = True - async for request_output in stream: - text = request_output.outputs[0].text - assistant_message = text[cursor:] - if not strip_flag: # strip the leading whitespace - yield assistant_message - elif assistant_message.strip(): - strip_flag = False - yield assistant_message.lstrip() - cursor = len(text) - except Exception as e: - logger.error(f"Error in chat API: {e}") - yield f"Error in chat API: {e}" - - -@functools.lru_cache(maxsize=1) -def _get_gen_config(community_chat_template: str) -> dict: - logger.info(f"Load community_chat_template: {community_chat_template}") - chat_template_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "chat_templates" - ) - config_path = os.path.join( - os.path.dirname(__file__), "chat_templates", "generation_configs" - ) - with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: - gen_config = json.load(f) - chat_template_file = gen_config["chat_template"].split("/")[-1] - with open(os.path.join(chat_template_path, chat_template_file)) as f: - chat_template = f.read() - gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") - return gen_config diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-761c/README.md b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/README.md new file mode 100644 index 00000000..b653f3b8 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/README.md @@ -0,0 +1,16 @@ +# qwen2:7b-instruct-fp16-761c + +[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML) +[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/) +[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger) +[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML) +[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai) + +This is a Machine Learning Service created with BentoML. + +## Help + +* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML. +* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community. +* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests. +* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description). diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/apis/openapi.yaml b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/apis/openapi.yaml similarity index 97% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/apis/openapi.yaml rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/apis/openapi.yaml index d7d79ba0..16f4226d 100644 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/apis/openapi.yaml +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/apis/openapi.yaml @@ -9,6 +9,11 @@ components: content: anyOf: - type: string + - items: + anyOf: + - $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam' + type: array - type: 'null' title: Content function_call: @@ -18,6 +23,11 @@ components: name: title: Name type: string + refusal: + anyOf: + - type: string + - type: 'null' + title: Refusal role: const: assistant enum: @@ -48,6 +58,22 @@ components: - type title: ChatCompletionContentPartImageParam type: object + ChatCompletionContentPartRefusalParam: + properties: + refusal: + title: Refusal + type: string + type: + const: refusal + enum: + - refusal + title: Type + type: string + required: + - refusal + - type + title: ChatCompletionContentPartRefusalParam + type: object ChatCompletionContentPartTextParam: properties: text: @@ -414,8 +440,12 @@ components: ChatCompletionSystemMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string name: title: Name type: string @@ -433,8 +463,12 @@ components: ChatCompletionToolMessageParam: properties: content: + anyOf: + - type: string + - items: + $ref: '#/components/schemas/ChatCompletionContentPartTextParam' + type: array title: Content - type: string role: const: tool enum: @@ -1021,7 +1055,7 @@ info: contact: email: contact@bentoml.com name: BentoML Team - description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.0-informational)](https://pypi.org/project/BentoML)\n\ + description: "# qwen2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.1-informational)](https://pypi.org/project/BentoML)\n\ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\ diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/apis/schema.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/apis/schema.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/apis/schema.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/apis/schema.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/bento.yaml b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/bento.yaml similarity index 96% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/bento.yaml rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/bento.yaml index 8a757698..1167183b 100644 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/bento.yaml +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/bento.yaml @@ -1,8 +1,8 @@ service: service:VLLM name: qwen2 -version: 7b-instruct-fp16-0016 -bentoml_version: 1.3.0 -creation_time: '2024-07-24T08:20:26.097517+00:00' +version: 7b-instruct-fp16-761c +bentoml_version: 1.3.1 +creation_time: '2024-08-12T08:44:00.355219+00:00' labels: model_name: Qwen/Qwen2-7B-Instruct openllm_alias: 7b,7b-instruct diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/docker/Dockerfile b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/docker/Dockerfile new file mode 100644 index 00000000..9a323f70 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/docker/Dockerfile @@ -0,0 +1,70 @@ +# =========================================== +# +# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT +# +# =========================================== + +# Block SETUP_BENTO_BASE_IMAGE +FROM python:3.9-slim as base-container + +ENV LANG=C.UTF-8 + +ENV LC_ALL=C.UTF-8 + +ENV PYTHONIOENCODING=UTF-8 + +ENV PYTHONUNBUFFERED=1 + + + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache +RUN set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential curl +ENV UV_SYSTEM_PYTHON=1 +RUN curl -LO https://astral.sh/uv/install.sh && \ + sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/ + +# Block SETUP_BENTO_USER +ARG BENTO_USER=bentoml +ARG BENTO_USER_UID=1034 +ARG BENTO_USER_GID=1034 +RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER +ARG HF_TOKEN= +ENV HF_TOKEN=$HF_TOKEN +ARG BENTO_PATH=/home/bentoml/bento +ENV BENTO_PATH=$BENTO_PATH +ENV BENTOML_HOME=/home/bentoml/ + +RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R +WORKDIR $BENTO_PATH + + +# Block SETUP_BENTO_COMPONENTS + +RUN uv pip install torch==2.3.1 ; exit 0 +RUN uv pip install vllm==0.5.3.post1 ; exit 0 +COPY --chown=bentoml:bentoml ./env/python ./env/python/ +# install python packages with install.sh +RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh +COPY --chown=bentoml:bentoml . ./ + +# Block SETUP_BENTO_ENTRYPOINT +RUN rm -rf /var/lib/{apt,cache,log} +# Default port for BentoServer +EXPOSE 3000 + +# Expose Prometheus port +EXPOSE 3001 + +RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh + +USER bentoml + +ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ] + + diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/docker/entrypoint.sh b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/docker/entrypoint.sh similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/docker/entrypoint.sh rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/docker/entrypoint.sh diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/install.sh b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/install.sh new file mode 100644 index 00000000..53ba63cd --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -exuo pipefail + +# Parent directory https://stackoverflow.com/a/246128/8643197 +BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )" + +pip_install() { + if command -v "uv" > /dev/null 2>&1; then + uv pip install "$@" + else + pip3 install "$@" + fi +} + +PIP_ARGS=() + +# BentoML by default generates two requirement files: +# - ./env/python/requirements.lock.txt: all dependencies locked to its version presented during `build` +# - ./env/python/requirements.txt: all dependencies as user specified in code or requirements.txt file +REQUIREMENTS_TXT="$BASEDIR/requirements.txt" +REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt" +WHEELS_DIR="$BASEDIR/wheels" +BENTOML_VERSION=${BENTOML_VERSION:-1.3.1} +# Install python packages, prefer installing the requirements.lock.txt file if it exist +pushd "$BASEDIR" &>/dev/null +if [ -f "$REQUIREMENTS_LOCK" ]; then + echo "Installing pip packages from 'requirements.lock.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK" +else + if [ -f "$REQUIREMENTS_TXT" ]; then + echo "Installing pip packages from 'requirements.txt'.." + pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT" + fi +fi +popd &>/dev/null + +# Attempt to expand the glob pattern. The nullglob option ensures that +# the pattern itself is not returned if no files match. +shopt -s nullglob +wheels=($WHEELS_DIR/*.whl) + +if [ ${#wheels[@]} -gt 0 ]; then + echo "Installing wheels packaged in Bento.." + pip_install "${PIP_ARGS[@]}" "${wheels[@]}" +fi + + +# Install the BentoML from PyPI if it's not already installed +if python3 -c "import bentoml" &> /dev/null; then + existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)") + if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then + echo "WARNING: using BentoML version ${existing_bentoml_version}" + fi +else + pip_install bentoml=="$BENTOML_VERSION" +fi \ No newline at end of file diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/requirements.lock.txt b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/requirements.lock.txt new file mode 100644 index 00000000..92396f10 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/requirements.lock.txt @@ -0,0 +1,158 @@ +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.4.0 +appdirs==1.4.4 +asgiref==3.8.1 +async-timeout==4.0.3 +attrs==24.2.0 +bentoml==1.3.1 +cattrs==23.1.2 +certifi==2024.7.4 +charset-normalizer==3.3.2 +circus==0.18.0 +click==8.1.7 +click-option-group==0.5.6 +cloudpickle==3.0.0 +cmake==3.30.2 +datasets==2.14.4 +deepmerge==1.1.1 +deprecated==1.2.14 +dill==0.3.7 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +email-validator==2.2.0 +exceptiongroup==1.2.2 +fastapi==0.111.0 +fastapi-cli==0.0.5 +filelock==3.15.4 +frozenlist==1.4.1 +fs==2.4.16 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +httpx-ws==0.6.0 +huggingface-hub==0.24.5 +idna==3.7 +importlib-metadata==6.11.0 +inflection==0.5.1 +inquirerpy==0.3.4 +interegular==0.3.3 +jinja2==3.1.4 +jiter==0.5.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +lark==1.1.9 +llvmlite==0.43.0 +lm-format-enforcer==0.10.3 +markdown-it-py==3.0.0 +markupsafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.15 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +numba==0.60.0 +numpy==1.26.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==11.525.150 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +openai==1.40.3 +opentelemetry-api==1.20.0 +opentelemetry-instrumentation==0.41b0 +opentelemetry-instrumentation-aiohttp-client==0.41b0 +opentelemetry-instrumentation-asgi==0.41b0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +opentelemetry-util-http==0.41b0 +orjson==3.10.7 +outlines==0.0.46 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +pfzy==0.3.4 +pillow==10.4.0 +pip-requirements-parser==32.0.1 +prometheus-client==0.20.0 +prometheus-fastapi-instrumentator==7.0.0 +prompt-toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyairports==2.1.1 +pyarrow==17.0.0 +pycountry==24.6.1 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.2 +pyzmq==26.1.0 +ray==2.34.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +safetensors==0.4.4 +schema==0.7.7 +sentencepiece==0.2.0 +setuptools==72.1.0 +shellingham==1.5.4 +simple-di==0.1.5 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.2 +tiktoken==0.7.0 +tokenizers==0.19.1 +tomli==2.0.1 +tomli-w==1.0.0 +torch==2.3.1 +torchvision==0.18.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.43.1 +triton==2.3.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.2 +uv==0.2.35 +uvicorn==0.30.5 +uvloop==0.19.0 +vllm==0.5.3.post1 +vllm-flash-attn==2.5.9.post1 +watchfiles==0.23.0 +wcwidth==0.2.13 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +xformers==0.0.27 +xxhash==3.4.1 +yarl==1.9.4 +zipp==3.20.0 diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/requirements.txt b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/requirements.txt new file mode 100644 index 00000000..84ab2103 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/requirements.txt @@ -0,0 +1,7 @@ +bentoml==1.3.1 +torch==2.3.1 +vllm==0.5.3.post1 +numpy==1.26.0 +transformers==4.43.1 +fastapi==0.111.0 +pyyaml diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/version.txt b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/version.txt similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/env/python/version.txt rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/env/python/version.txt diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/bentofile.yaml b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/bentofile.yaml new file mode 100644 index 00000000..568fecb2 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/bentofile.yaml @@ -0,0 +1,22 @@ +envs: +- name: HF_TOKEN +include: +- '*.py' +- '*.yaml' +- ui/* +- ui/chunks/* +- ui/css/* +- ui/media/* +- ui/chunks/pages/* +- bentovllm_openai/*.py +- chat_templates/chat_templates/*.jinja +- chat_templates/generation_configs/*.json +labels: + model_name: Qwen/Qwen2-7B-Instruct + openllm_alias: 7b,7b-instruct + platforms: linux + source: https://github.com/bentoml/openllm-models-feed/tree/main/src/vllm-chat +python: + lock_packages: true + requirements_txt: ./requirements.txt +service: service:VLLM diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/alpaca.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/alpaca.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/alpaca.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/alpaca.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/amberchat.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/amberchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/amberchat.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/amberchat.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/chatml.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/chatml.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/chatml.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/chatml.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/chatqa.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/chatqa.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/chatqa.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/chatqa.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/falcon-instruct.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/falcon-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/falcon-instruct.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/falcon-instruct.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/gemma-it.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/gemma-it.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/gemma-it.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/gemma-it.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/llama-2-chat.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/llama-2-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/llama-2-chat.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/llama-2-chat.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/llama-3-chat.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/llama-3-chat.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/llama-3-chat.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/llama-3-chat.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/mistral-instruct.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/mistral-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/mistral-instruct.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/mistral-instruct.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/openchat.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/openchat.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/openchat.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/openchat.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/phi-3.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/phi-3.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/phi-3.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/phi-3.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/saiga.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/saiga.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/saiga.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/saiga.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/solar-instruct.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/solar-instruct.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/solar-instruct.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/solar-instruct.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/vicuna.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/vicuna.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/vicuna.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/vicuna.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/zephyr.jinja b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/zephyr.jinja similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/chat_templates/zephyr.jinja rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/chat_templates/zephyr.jinja diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/alpaca.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/alpaca.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/alpaca.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/alpaca.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/amberchat.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/amberchat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/amberchat.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/amberchat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/chatqa.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/chatqa.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/chatqa.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/chatqa.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/gemma-it.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/gemma-it.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/gemma-it.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/gemma-it.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/llama-2-chat.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/llama-2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/llama-2-chat.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/llama-2-chat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/llama-3-chat.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/llama-3-chat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/llama-3-chat.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/llama-3-chat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/mistral-instruct.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/mistral-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/mistral-instruct.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/mistral-instruct.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/openchat.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/openchat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/openchat.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/openchat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/orca-2.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/orca-2.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/orca-2.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/orca-2.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/phi-3.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/phi-3.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/phi-3.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/phi-3.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/qwen2-chat.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/qwen2-chat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/qwen2-chat.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/qwen2-chat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/saiga.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/saiga.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/saiga.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/saiga.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/solar-instruct.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/solar-instruct.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/solar-instruct.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/solar-instruct.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/vicuna.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/vicuna.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/vicuna.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/vicuna.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/yi-chat.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/yi-chat.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/yi-chat.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/yi-chat.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/zephyr.json b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/zephyr.json similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/chat_templates/generation_configs/zephyr.json rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/chat_templates/generation_configs/zephyr.json diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/bento_constants.py b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/openllm_config.yaml similarity index 91% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/bento_constants.py rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/openllm_config.yaml index 1ed0f120..a397cf85 100644 --- a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/bento_constants.py +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/openllm_config.yaml @@ -1,5 +1,3 @@ - -CONSTANT_YAML = ''' engine_config: dtype: half max_model_len: 2048 @@ -15,5 +13,3 @@ gpu_type: nvidia-tesla-l4 traffic: timeout: 300 - -''' diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/service.py b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/service.py new file mode 100644 index 00000000..8fd0ca16 --- /dev/null +++ b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/service.py @@ -0,0 +1,245 @@ +import functools +import json +import logging +import os +import sys +import uuid +from typing import AsyncGenerator, Literal, Optional + +import bentoml +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge, Le +from fastapi.responses import FileResponse +from typing_extensions import Annotated, Literal + + +class Message(pydantic.BaseModel): + content: str + role: Literal["system", "user", "assistant"] + +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) + +ENGINE_CONFIG = CONSTANTS["engine_config"] +SERVICE_CONFIG = CONSTANTS["service_config"] +OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +openai_api_app = fastapi.FastAPI() +static_app = fastapi.FastAPI() +ui_app = fastapi.FastAPI() + + +OPENAI_ENDPOINTS = [ + ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]], + ["/completions", vllm_api_server.create_completion, ["POST"]], + ["/models", vllm_api_server.show_available_models, ["GET"]], +] + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +for route, endpoint, methods in OPENAI_ENDPOINTS: + openai_api_app.add_api_route( + path=route, + endpoint=endpoint, + methods=methods, + include_in_schema=True, + ) + + +STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui") + +ui_app.mount( + "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static" +) + + +@ui_app.get("/") +async def serve_chat_html(): + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +@ui_app.get("/{full_path:path}") +async def catch_all(full_path: str): + file_path = os.path.join(STATIC_DIR, full_path) + if os.path.exists(file_path): + return FileResponse(file_path) + return FileResponse(os.path.join(STATIC_DIR, "chat.html")) + + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.mount_asgi_app(ui_app, path="/chat") +@bentoml.service(**SERVICE_CONFIG) +class VLLM: + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs, AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG) + self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS) + self.tokenizer = AutoTokenizer.from_pretrained(ENGINE_CONFIG["model"]) + logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}") + + if OVERRIDE_CHAT_TEMPLATE: # use community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + chat_template = gen_config["template"] + else: + chat_template = None + + model_config = self.engine.engine.get_model_config() + + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + response_role="assistant", + chat_template=chat_template, + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + engine=self.engine, + served_model_names=[ENGINE_CONFIG["model"]], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + if stop is None: + stop = [] + + from vllm import SamplingParams + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM) + + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + model: str = ENGINE_CONFIG["model"], + max_tokens: Annotated[ + int, + Ge(128), + Le(ENGINE_CONFIG["max_model_len"]), + ] = ENGINE_CONFIG["max_model_len"], + stop: Optional[list[str]] = None, + ) -> AsyncGenerator[str, None]: + """ + light-weight chat API that takes in a list of messages and returns a response + """ + from vllm import SamplingParams + + try: + if OVERRIDE_CHAT_TEMPLATE: # community chat template + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + else: + if not stop: + if self.tokenizer.eos_token is not None: + stop = [self.tokenizer.eos_token] + else: + stop = [] + system_prompt = None + + SAMPLING_PARAM = SamplingParams( + max_tokens=max_tokens, + stop=stop, + ) + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SAMPLING_PARAM + ) + + cursor = 0 + strip_flag = True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API: {e}") + yield f"Error in chat API: {e}" + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict: + logger.info(f"Load community_chat_template: {community_chat_template}") + chat_template_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "chat_templates" + ) + config_path = os.path.join( + os.path.dirname(__file__), "chat_templates", "generation_configs" + ) + with open(os.path.join(config_path, f"{community_chat_template}.json")) as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with open(os.path.join(chat_template_path, chat_template_file)) as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/404.html b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/404.html similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/404.html rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/404.html diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_buildManifest.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/UFSnOXBHq5ysU6-5BuENB/_ssgManifest.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/0e5ce63c-f5957df8d97fa48f.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/370b0802-87e84e603248538e.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/3d47b92a-f8bda4b39f1e2d9d.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/479ba886-0c92f49cb8e74e58.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/59650de3-87b10f0662b51900.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/66ec4792-34336521b476aa45.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/674-a1fcdac3696c5ed0.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/69-bf2efb63b1299e3b.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/700-532b1fe2415e5859.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/700-532b1fe2415e5859.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/700-532b1fe2415e5859.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/700-532b1fe2415e5859.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/899-fa939dd99dc7a1df.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/8e1d74a4-a6b9a2554f9153c0.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/94730671-0f73873f7f5896de.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/995-34374f39bb210839.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/995-34374f39bb210839.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/995-34374f39bb210839.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/995-34374f39bb210839.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/app/(site)/page-5b6e14439f55739b.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/app/chat/page-9c8e223f40771eb6.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/app/layout-df2dea9dba0ceb06.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/app/not-found-c76dccfb8b88da53.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/d3ac728e-0c798b3b8aa3bf53.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/fd9d1056-32c33f3919735051.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/framework-00a8ba1a63cfdc9e.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/main-app-e95f89b5006af8a8.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/main-bf1416cb53f2b4c0.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/pages/_app-d21e88acd55d90f1.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/pages/_error-d6107f1aac0c574c.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/chunks/webpack-ee8b17d5a5297ccd.js diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/css/429544bd3cd8ce3a.css b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/css/429544bd3cd8ce3a.css similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/css/429544bd3cd8ce3a.css rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/css/429544bd3cd8ce3a.css diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/css/5b67f082b31cfc7b.css b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/css/5b67f082b31cfc7b.css similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/css/5b67f082b31cfc7b.css rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/css/5b67f082b31cfc7b.css diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/css/9e63023b20ddb15e.css b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/css/9e63023b20ddb15e.css similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/_next/static/css/9e63023b20ddb15e.css rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/_next/static/css/9e63023b20ddb15e.css diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/apple-touch-icon.png b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/apple-touch-icon.png similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/apple-touch-icon.png rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/apple-touch-icon.png diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/chat.html b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/chat.html similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/chat.html rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/chat.html diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/chat.txt b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/chat.txt similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/chat.txt rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/chat.txt diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/favicon-16x16.png b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/favicon-16x16.png similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/favicon-16x16.png rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/favicon-16x16.png diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/favicon.ico b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/favicon.ico similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/favicon.ico rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/favicon.ico diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/index.html b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/index.html similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/index.html rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/index.html diff --git a/bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/index.txt b/bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/index.txt similarity index 100% rename from bentoml/bentos/qwen2/7b-instruct-fp16-0016/src/ui/index.txt rename to bentoml/bentos/qwen2/7b-instruct-fp16-761c/src/ui/index.txt diff --git a/src/chattts/bento_constants.py b/src/chattts/openllm_config.yaml similarity index 80% rename from src/chattts/bento_constants.py rename to src/chattts/openllm_config.yaml index 6c20a8f9..421373c6 100644 --- a/src/chattts/bento_constants.py +++ b/src/chattts/openllm_config.yaml @@ -1,4 +1,3 @@ -CONSTANT_YAML = """ project: chattts service_config: name: chattts @@ -6,4 +5,3 @@ timeout: 300 resources: gpu: 1 -""" diff --git a/src/chattts/service.py b/src/chattts/service.py index 7c1b4efe..5b84e67c 100644 --- a/src/chattts/service.py +++ b/src/chattts/service.py @@ -6,11 +6,13 @@ import fastapi import fastapi.staticfiles import yaml -from bento_constants import CONSTANT_YAML from bentoml.validators import ContentType from fastapi.responses import FileResponse -CONSTANTS = yaml.safe_load(CONSTANT_YAML) +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) ROOT_DIR = os.path.join(os.path.dirname(__file__), "ui", "build") ui_app = fastapi.FastAPI() diff --git a/src/llamacpp-chat/bento_constants.py b/src/llamacpp-chat/bento_constants.py deleted file mode 100644 index 658ebbc0..00000000 --- a/src/llamacpp-chat/bento_constants.py +++ /dev/null @@ -1,13 +0,0 @@ -CONSTANT_YAML = """ - project: llamacpp-chat - service_config: - name: phi3 - traffic: - timeout: 300 - resources: - memory: 3Gi - engine_config: - model: microsoft/Phi-3-mini-4k-instruct-gguf - max_model_len: 2048 - chat_template: phi-3 -""" diff --git a/src/llamacpp-chat/bentofile.yaml b/src/llamacpp-chat/bentofile.yaml index 8bf9a4d6..ee9bedc9 100644 --- a/src/llamacpp-chat/bentofile.yaml +++ b/src/llamacpp-chat/bentofile.yaml @@ -4,6 +4,7 @@ labels: platforms: macos,linux include: - "*.py" +- "*.yaml" - "ui/*" python: requirements_txt: "./requirements.txt" diff --git a/src/llamacpp-chat/openllm_config.yaml b/src/llamacpp-chat/openllm_config.yaml new file mode 100644 index 00000000..f18bf631 --- /dev/null +++ b/src/llamacpp-chat/openllm_config.yaml @@ -0,0 +1,11 @@ +project: llamacpp-chat +service_config: + name: phi3 + traffic: + timeout: 300 + resources: + memory: 3Gi +engine_config: + model: microsoft/Phi-3-mini-4k-instruct-gguf + max_model_len: 2048 +chat_template: phi-3 diff --git a/src/llamacpp-chat/service.py b/src/llamacpp-chat/service.py index 7a9b5544..4ec07067 100644 --- a/src/llamacpp-chat/service.py +++ b/src/llamacpp-chat/service.py @@ -3,7 +3,6 @@ from typing_extensions import Annotated from llama_cpp import Llama from typing import AsyncGenerator, Optional -from bento_constants import CONSTANT_YAML import yaml import fastapi import fastapi.staticfiles @@ -20,7 +19,10 @@ Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.If you don't know the answer to a question, please don't share false information """ -CONSTANTS = yaml.safe_load(CONSTANT_YAML) +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) ENGINE_CONFIG = CONSTANTS["engine_config"] SERVICE_CONFIG = CONSTANTS["service_config"] diff --git a/src/make.py b/src/make.py index 5c665f00..42897c01 100644 --- a/src/make.py +++ b/src/make.py @@ -15,13 +15,6 @@ BENTOML_HOME = pathlib.Path(os.environ["BENTOML_HOME"]) -CONSTANT_YAML_TMPL = r""" -CONSTANT_YAML = ''' -{} -''' -""" - - def hash_file(file_path): hasher = hashlib.sha256() with open(file_path, "rb") as f: @@ -42,16 +35,14 @@ def hash_directory(directory_path): return hasher.hexdigest() -def ensure_venv(project, venv_dir): - req_hash = hash_file(project / "requirements.txt") - venv_path = venv_dir / req_hash[:7] - if not venv_path.exists(): +def ensure_venv(req_txt_file, venv_dir): + if not venv_dir.exists(): subprocess.run( [ sys.executable, "-m", "venv", - venv_path, + venv_dir, ], check=True, ) @@ -64,7 +55,7 @@ def ensure_venv(project, venv_dir): "install", "bentoml", "-p", - venv_path/"bin"/"python", + venv_dir/"bin"/"python", ], check=True, ) @@ -76,13 +67,13 @@ def ensure_venv(project, venv_dir): "pip", "install", "-r", - project / "requirements.txt", + req_txt_file, "-p", - venv_path/"bin"/"python", + venv_dir/"bin"/"python", ], check=True, ) - return venv_path + return venv_dir if __name__ == "__main__": @@ -106,8 +97,8 @@ def ensure_venv(project, venv_dir): tempdir = pathlib.Path(tempdir) shutil.copytree(project, tempdir, dirs_exist_ok=True) - with open(tempdir / "bento_constants.py", "w") as f: - f.write(CONSTANT_YAML_TMPL.format(yaml.dump(config))) + with open(tempdir / "openllm_config.yaml", "w") as f: + f.write(yaml.dump(config)) labels = config.get("extra_labels", {}) envs = config.get("extra_envs", []) @@ -136,7 +127,10 @@ def ensure_venv(project, venv_dir): ) continue - version_path = ensure_venv(tempdir, pathlib.Path(project).absolute() / "venv") + # prepare venv + req_txt_file = tempdir / "requirements.txt" + venv_dir = pathlib.Path("venv").absolute() / f"{project}-{hash_file(req_txt_file)[:7]}" + version_path = ensure_venv(req_txt_file, venv_dir) subprocess.run( [ diff --git a/src/recipe.yaml b/src/recipe.yaml index cb86018a..cfefd1b9 100644 --- a/src/recipe.yaml +++ b/src/recipe.yaml @@ -122,10 +122,10 @@ extra_labels: openllm_alias: 7b,7b-instruct model_name: mistralai/Mistral-7B-Instruct-v0.1 -"mistral:large-123b-instruct-fp16": +"mistral-large:123b-instruct-fp16": project: vllm-chat service_config: - name: mistral + name: mistral-large traffic: timeout: 300 resources: @@ -138,12 +138,12 @@ tensor_parallel_size: 4 chat_template: mistral-instruct extra_labels: - openllm_alias: large,large-instruct, 123b, 123b-instruct + openllm_alias: 123b, 123b-instruct-2407 model_name: mistralai/Mistral-Large-Instruct-2407 -"mistral:large-123b-instruct-awq-4bit": +"mistral-large:123b-instruct-awq-4bit": project: vllm-chat service_config: - name: mistral + name: mistral-large traffic: timeout: 300 resources: @@ -155,7 +155,7 @@ dtype: half chat_template: mistral-instruct extra_labels: - openllm_alias: large-4bit,large-instruct-4bit, 123b-4bit, 123b-instruct-4bit + openllm_alias: 123b-4bit,123b-instruct-2407-4bit model_name: casperhansen/mistral-large-instruct-2407-awq "llama3:8b-instruct-awq-4bit": project: vllm-chat diff --git a/src/vllm-chat/bento_constants.py b/src/vllm-chat/bento_constants.py deleted file mode 100644 index 1a0e6f7e..00000000 --- a/src/vllm-chat/bento_constants.py +++ /dev/null @@ -1,16 +0,0 @@ -CONSTANT_YAML = """ - alias: - - 7b-4bit - project: vllm-chat - service_config: - name: qwen2 - traffic: - timeout: 300 - resources: - gpu: 1 - gpu_type: nvidia-rtx-3060 - engine_config: - model: Qwen/Qwen2-7B-Instruct-AWQ - max_model_len: 2048 - quantization: awq -""" diff --git a/src/vllm-chat/bentofile.yaml b/src/vllm-chat/bentofile.yaml index 4cbd67fa..0f964f80 100644 --- a/src/vllm-chat/bentofile.yaml +++ b/src/vllm-chat/bentofile.yaml @@ -4,6 +4,7 @@ labels: platforms: linux include: - "*.py" +- "*.yaml" - "ui/*" - "ui/chunks/*" - "ui/css/*" diff --git a/src/vllm-chat/openllm_config.yaml b/src/vllm-chat/openllm_config.yaml new file mode 100644 index 00000000..987b9c39 --- /dev/null +++ b/src/vllm-chat/openllm_config.yaml @@ -0,0 +1,14 @@ +alias: + - 7b-4bit +project: vllm-chat +service_config: + name: qwen2 + traffic: + timeout: 300 + resources: + gpu: 1 + gpu_type: nvidia-rtx-3060 +engine_config: + model: Qwen/Qwen2-7B-Instruct-AWQ + max_model_len: 2048 + quantization: awq diff --git a/src/vllm-chat/service.py b/src/vllm-chat/service.py index a6f4b728..8fd0ca16 100644 --- a/src/vllm-chat/service.py +++ b/src/vllm-chat/service.py @@ -13,7 +13,6 @@ import vllm.entrypoints.openai.api_server as vllm_api_server import yaml from annotated_types import Ge, Le -from bento_constants import CONSTANT_YAML from fastapi.responses import FileResponse from typing_extensions import Annotated, Literal @@ -22,8 +21,10 @@ class Message(pydantic.BaseModel): content: str role: Literal["system", "user", "assistant"] - -CONSTANTS = yaml.safe_load(CONSTANT_YAML) +# Load the constants from the yaml file +CONSTANT_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml") +with open(CONSTANT_YAML) as f: + CONSTANTS = yaml.safe_load(f) ENGINE_CONFIG = CONSTANTS["engine_config"] SERVICE_CONFIG = CONSTANTS["service_config"]