diff --git a/bentoml/bentos/llama3_2/11b-vision/README.md b/bentoml/bentos/llama3_2/11b-vision/README.md
new file mode 100644
index 00000000..992173e6
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/README.md
@@ -0,0 +1,16 @@
+# llama3_2:11b-vision
+
+[![pypi_status](https://img.shields.io/badge/BentoML-1.3.6-informational)](https://pypi.org/project/BentoML)
+[![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)
+[![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)
+[![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)
+[![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai)
+
+This is a Machine Learning Service created with BentoML.
+
+## Help
+
+* [📖 Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML.
+* [💬 Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML Slack community.
+* [🐛 GitHub Issues](https://github.com/bentoml/BentoML/issues): Report bugs and feature requests.
+* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description).
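
Once this bento is served, the `/generate` route defined below streams text chunks back to the caller. A minimal client sketch, assuming the server listens on the default `http://localhost:3000`:

```python
# Minimal streaming client sketch; assumes a locally served bento on port 3000.
import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    # Streaming endpoints yield string chunks as the model generates them.
    for chunk in client.generate(prompt="What is the capital of France?"):
        print(chunk, end="", flush=True)
```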
diff --git a/bentoml/bentos/llama3_2/11b-vision/apis/openapi.yaml b/bentoml/bentos/llama3_2/11b-vision/apis/openapi.yaml
new file mode 100644
index 00000000..a811296f
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/apis/openapi.yaml
@@ -0,0 +1,1344 @@
+components:
+ schemas:
+ AudioURL:
+ properties:
+ url:
+ title: Url
+ type: string
+ required:
+ - url
+ title: AudioURL
+ type: object
+ BaseModel:
+ properties: {}
+ title: BaseModel
+ type: object
+ ChatCompletionAssistantMessageParam:
+ properties:
+ content:
+ anyOf:
+ - type: string
+ - items:
+ anyOf:
+ - $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
+ - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam'
+ type: array
+ - type: 'null'
+ title: Content
+ function_call:
+ anyOf:
+ - $ref: '#/components/schemas/FunctionCall'
+ - type: 'null'
+ name:
+ title: Name
+ type: string
+ refusal:
+ anyOf:
+ - type: string
+ - type: 'null'
+ title: Refusal
+ role:
+ const: assistant
+ enum:
+ - assistant
+ title: Role
+ type: string
+ tool_calls:
+ items:
+ $ref: '#/components/schemas/ChatCompletionMessageToolCallParam'
+ title: Tool Calls
+ type: array
+ required:
+ - role
+ title: ChatCompletionAssistantMessageParam
+ type: object
+ ChatCompletionContentPartAudioParam:
+ properties:
+ audio_url:
+ $ref: '#/components/schemas/AudioURL'
+ type:
+ const: audio_url
+ enum:
+ - audio_url
+ title: Type
+ type: string
+ required:
+ - audio_url
+ - type
+ title: ChatCompletionContentPartAudioParam
+ type: object
+ ChatCompletionContentPartImageParam:
+ properties:
+ image_url:
+ $ref: '#/components/schemas/ImageURL'
+ type:
+ const: image_url
+ enum:
+ - image_url
+ title: Type
+ type: string
+ required:
+ - image_url
+ - type
+ title: ChatCompletionContentPartImageParam
+ type: object
+ ChatCompletionContentPartRefusalParam:
+ properties:
+ refusal:
+ title: Refusal
+ type: string
+ type:
+ const: refusal
+ enum:
+ - refusal
+ title: Type
+ type: string
+ required:
+ - refusal
+ - type
+ title: ChatCompletionContentPartRefusalParam
+ type: object
+ ChatCompletionContentPartTextParam:
+ properties:
+ text:
+ title: Text
+ type: string
+ type:
+ const: text
+ enum:
+ - text
+ title: Type
+ type: string
+ required:
+ - text
+ - type
+ title: ChatCompletionContentPartTextParam
+ type: object
+ ChatCompletionFunctionMessageParam:
+ properties:
+ content:
+ anyOf:
+ - type: string
+ - type: 'null'
+ title: Content
+ name:
+ title: Name
+ type: string
+ role:
+ const: function
+ enum:
+ - function
+ title: Role
+ type: string
+ required:
+ - content
+ - name
+ - role
+ title: ChatCompletionFunctionMessageParam
+ type: object
+ ChatCompletionMessageToolCallParam:
+ properties:
+ function:
+ $ref: '#/components/schemas/Function'
+ id:
+ title: Id
+ type: string
+ type:
+ const: function
+ enum:
+ - function
+ title: Type
+ type: string
+ required:
+ - id
+ - function
+ - type
+ title: ChatCompletionMessageToolCallParam
+ type: object
+ ChatCompletionNamedFunction:
+ additionalProperties: false
+ properties:
+ name:
+ title: Name
+ type: string
+ required:
+ - name
+ title: ChatCompletionNamedFunction
+ type: object
+ ChatCompletionNamedToolChoiceParam:
+ additionalProperties: false
+ properties:
+ function:
+ $ref: '#/components/schemas/ChatCompletionNamedFunction'
+ type:
+ const: function
+ default: function
+ enum:
+ - function
+ title: Type
+ type: string
+ required:
+ - function
+ title: ChatCompletionNamedToolChoiceParam
+ type: object
+ ChatCompletionRequest:
+ additionalProperties: false
+ properties:
+ add_generation_prompt:
+ default: true
+ description: If true, the generation prompt will be added to the chat template.
+ This is a parameter used by chat template in tokenizer config of the model.
+ title: Add Generation Prompt
+ type: boolean
+ add_special_tokens:
+ default: false
+ description: If true, special tokens (e.g. BOS) will be added to the prompt
+ on top of what is added by the chat template. For most models, the chat
+ template takes care of adding the special tokens so this should be set
+ to false (as is the default).
+ title: Add Special Tokens
+ type: boolean
+ best_of:
+ anyOf:
+ - type: integer
+ - type: 'null'
+ title: Best Of
+ chat_template:
+ anyOf:
+ - type: string
+ - type: 'null'
+ description: A Jinja template to use for this conversion. As of transformers
+ v4.44, default chat template is no longer allowed, so you must provide
+ a chat template if the tokenizer does not define one.
+ title: Chat Template
+ chat_template_kwargs:
+ anyOf:
+ - type: object
+ - type: 'null'
+ description: Additional kwargs to pass to the template renderer. Will be
+ accessible by the chat template.
+ title: Chat Template Kwargs
+ documents:
+ anyOf:
+ - items:
+ additionalProperties:
+ type: string
+ type: object
+ type: array
+ - type: 'null'
+ description: A list of dicts representing documents that will be accessible
+ to the model if it is performing RAG (retrieval-augmented generation).
+ If the template does not support RAG, this argument will have no effect.
+ We recommend that each document should be a dict containing "title" and
+ "text" keys.
+ title: Documents
+ early_stopping:
+ default: false
+ title: Early Stopping
+ type: boolean
+ echo:
+ default: false
+ description: If true, the new message will be prepended with the last message
+ if they belong to the same role.
+ title: Echo
+ type: boolean
+ frequency_penalty:
+ anyOf:
+ - type: number
+ - type: 'null'
+ default: 0.0
+ title: Frequency Penalty
+ guided_choice:
+ anyOf:
+ - items:
+ type: string
+ type: array
+ - type: 'null'
+ description: If specified, the output will be exactly one of the choices.
+ title: Guided Choice
+ guided_decoding_backend:
+ anyOf:
+ - type: string
+ - type: 'null'
+ description: If specified, will override the default guided decoding backend
+ of the server for this specific request. If set, must be either 'outlines'
+ / 'lm-format-enforcer'
+ title: Guided Decoding Backend
+ guided_grammar:
+ anyOf:
+ - type: string
+ - type: 'null'
+ description: If specified, the output will follow the context free grammar.
+ title: Guided Grammar
+ guided_json:
+ anyOf:
+ - type: string
+ - type: object
+ - $ref: '#/components/schemas/BaseModel'
+ - type: 'null'
+ description: If specified, the output will follow the JSON schema.
+ title: Guided Json
+ guided_regex:
+ anyOf:
+ - type: string
+ - type: 'null'
+ description: If specified, the output will follow the regex pattern.
+ title: Guided Regex
+ guided_whitespace_pattern:
+ anyOf:
+ - type: string
+ - type: 'null'
+ description: If specified, will override the default whitespace pattern
+ for guided json decoding.
+ title: Guided Whitespace Pattern
+ ignore_eos:
+ default: false
+ title: Ignore Eos
+ type: boolean
+ include_stop_str_in_output:
+ default: false
+ title: Include Stop Str In Output
+ type: boolean
+ length_penalty:
+ default: 1.0
+ title: Length Penalty
+ type: number
+ logit_bias:
+ anyOf:
+ - additionalProperties:
+ type: number
+ type: object
+ - type: 'null'
+ title: Logit Bias
+ logprobs:
+ anyOf:
+ - type: boolean
+ - type: 'null'
+ default: false
+ title: Logprobs
+ max_tokens:
+ anyOf:
+ - type: integer
+ - type: 'null'
+ title: Max Tokens
+ messages:
+ items:
+ anyOf:
+ - $ref: '#/components/schemas/ChatCompletionSystemMessageParam'
+ - $ref: '#/components/schemas/ChatCompletionUserMessageParam'
+ - $ref: '#/components/schemas/ChatCompletionAssistantMessageParam'
+ - $ref: '#/components/schemas/ChatCompletionToolMessageParam'
+ - $ref: '#/components/schemas/ChatCompletionFunctionMessageParam'
+ - $ref: '#/components/schemas/CustomChatCompletionMessageParam'
+ title: Messages
+ type: array
+ min_p:
+ default: 0.0
+ title: Min P
+ type: number
+ min_tokens:
+ default: 0
+ title: Min Tokens
+ type: integer
+ model:
+ title: Model
+ type: string
+ n:
+ anyOf:
+ - type: integer
+ - type: 'null'
+ default: 1
+ title: N
+ parallel_tool_calls:
+ anyOf:
+ - type: boolean
+ - type: 'null'
+ default: false
+ title: Parallel Tool Calls
+ presence_penalty:
+ anyOf:
+ - type: number
+ - type: 'null'
+ default: 0.0
+ title: Presence Penalty
+ prompt_logprobs:
+ anyOf:
+ - type: integer
+ - type: 'null'
+ title: Prompt Logprobs
+ repetition_penalty:
+ default: 1.0
+ title: Repetition Penalty
+ type: number
+ response_format:
+ anyOf:
+ - $ref: '#/components/schemas/ResponseFormat'
+ - type: 'null'
+ seed:
+ anyOf:
+ - maximum: 9.223372036854776e+18
+ minimum: -9.223372036854776e+18
+ type: integer
+ - type: 'null'
+ title: Seed
+ skip_special_tokens:
+ default: true
+ title: Skip Special Tokens
+ type: boolean
+ spaces_between_special_tokens:
+ default: true
+ title: Spaces Between Special Tokens
+ type: boolean
+ stop:
+ anyOf:
+ - type: string
+ - items:
+ type: string
+ type: array
+ - type: 'null'
+ title: Stop
+ stop_token_ids:
+ anyOf:
+ - items:
+ type: integer
+ type: array
+ - type: 'null'
+ title: Stop Token Ids
+ stream:
+ anyOf:
+ - type: boolean
+ - type: 'null'
+ default: false
+ title: Stream
+ stream_options:
+ anyOf:
+ - $ref: '#/components/schemas/StreamOptions'
+ - type: 'null'
+ temperature:
+ anyOf:
+ - type: number
+ - type: 'null'
+ default: 0.7
+ title: Temperature
+ tool_choice:
+ anyOf:
+ - const: none
+ enum:
+ - none
+ type: string
+ - const: auto
+ enum:
+ - auto
+ type: string
+ - $ref: '#/components/schemas/ChatCompletionNamedToolChoiceParam'
+ - type: 'null'
+ default: none
+ title: Tool Choice
+ tools:
+ anyOf:
+ - items:
+ $ref: '#/components/schemas/ChatCompletionToolsParam'
+ type: array
+ - type: 'null'
+ title: Tools
+ top_k:
+ default: -1
+ title: Top K
+ type: integer
+ top_logprobs:
+ anyOf:
+ - type: integer
+ - type: 'null'
+ default: 0
+ title: Top Logprobs
+ top_p:
+ anyOf:
+ - type: number
+ - type: 'null'
+ default: 1.0
+ title: Top P
+ truncate_prompt_tokens:
+ anyOf:
+ - minimum: 1.0
+ type: integer
+ - type: 'null'
+ title: Truncate Prompt Tokens
+ use_beam_search:
+ default: false
+ title: Use Beam Search
+ type: boolean
+ user:
+ anyOf:
+ - type: string
+ - type: 'null'
+ title: User
+ required:
+ - messages
+ - model
+ title: ChatCompletionRequest
+ type: object
+ ChatCompletionSystemMessageParam:
+ properties:
+ content:
+ anyOf:
+ - type: string
+ - items:
+ $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
+ type: array
+ title: Content
+ name:
+ title: Name
+ type: string
+ role:
+ const: system
+ enum:
+ - system
+ title: Role
+ type: string
+ required:
+ - content
+ - role
+ title: ChatCompletionSystemMessageParam
+ type: object
+ ChatCompletionToolMessageParam:
+ properties:
+ content:
+ anyOf:
+ - type: string
+ - items:
+ $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
+ type: array
+ title: Content
+ role:
+ const: tool
+ enum:
+ - tool
+ title: Role
+ type: string
+ tool_call_id:
+ title: Tool Call Id
+ type: string
+ required:
+ - content
+ - role
+ - tool_call_id
+ title: ChatCompletionToolMessageParam
+ type: object
+ ChatCompletionToolsParam:
+ additionalProperties: false
+ properties:
+ function:
+ $ref: '#/components/schemas/FunctionDefinition'
+ type:
+ const: function
+ default: function
+ enum:
+ - function
+ title: Type
+ type: string
+ required:
+ - function
+ title: ChatCompletionToolsParam
+ type: object
+ ChatCompletionUserMessageParam:
+ properties:
+ content:
+ anyOf:
+ - type: string
+ - items:
+ anyOf:
+ - $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
+ - $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
+ type: array
+ title: Content
+ name:
+ title: Name
+ type: string
+ role:
+ const: user
+ enum:
+ - user
+ title: Role
+ type: string
+ required:
+ - content
+ - role
+ title: ChatCompletionUserMessageParam
+ type: object
+ CompletionRequest:
+ additionalProperties: false
+ properties:
+ add_special_tokens:
+ default: true
+ description: If true (the default), special tokens (e.g. BOS) will be added
+ to the prompt.
+ title: Add Special Tokens
+ type: boolean
+ allowed_token_ids:
+ anyOf:
+ - items:
+ type: integer
+ type: array
+ - type: 'null'
+ title: Allowed Token Ids
+ best_of:
+ anyOf:
+ - type: integer
+ - type: 'null'
+ title: Best Of
+ early_stopping:
+ default: false
+ title: Early Stopping
+ type: boolean
+ echo:
+ anyOf:
+ - type: boolean
+ - type: 'null'
+ default: false
+ title: Echo
+ frequency_penalty:
+ anyOf:
+ - type: number
+ - type: 'null'
+ default: 0.0
+ title: Frequency Penalty
+ guided_choice:
+ anyOf:
+ - items:
+ type: string
+ type: array
+ - type: 'null'
+ description: If specified, the output will be exactly one of the choices.
+ title: Guided Choice
+ guided_decoding_backend:
+ anyOf:
+ - type: string
+ - type: 'null'
+ description: If specified, will override the default guided decoding backend
+ of the server for this specific request. If set, must be one of 'outlines'
+ / 'lm-format-enforcer'
+ title: Guided Decoding Backend
+ guided_grammar:
+ anyOf:
+ - type: string
+ - type: 'null'
+ description: If specified, the output will follow the context free grammar.
+ title: Guided Grammar
+ guided_json:
+ anyOf:
+ - type: string
+ - type: object
+ - $ref: '#/components/schemas/BaseModel'
+ - type: 'null'
+ description: If specified, the output will follow the JSON schema.
+ title: Guided Json
+ guided_regex:
+ anyOf:
+ - type: string
+ - type: 'null'
+ description: If specified, the output will follow the regex pattern.
+ title: Guided Regex
+ guided_whitespace_pattern:
+ anyOf:
+ - type: string
+ - type: 'null'
+ description: If specified, will override the default whitespace pattern
+ for guided json decoding.
+ title: Guided Whitespace Pattern
+ ignore_eos:
+ default: false
+ title: Ignore Eos
+ type: boolean
+ include_stop_str_in_output:
+ default: false
+ title: Include Stop Str In Output
+ type: boolean
+ length_penalty:
+ default: 1.0
+ title: Length Penalty
+ type: number
+ logit_bias:
+ anyOf:
+ - additionalProperties:
+ type: number
+ type: object
+ - type: 'null'
+ title: Logit Bias
+ logprobs:
+ anyOf:
+ - type: integer
+ - type: 'null'
+ title: Logprobs
+ max_tokens:
+ anyOf:
+ - type: integer
+ - type: 'null'
+ default: 16
+ title: Max Tokens
+ min_p:
+ default: 0.0
+ title: Min P
+ type: number
+ min_tokens:
+ default: 0
+ title: Min Tokens
+ type: integer
+ model:
+ title: Model
+ type: string
+ n:
+ default: 1
+ title: N
+ type: integer
+ presence_penalty:
+ anyOf:
+ - type: number
+ - type: 'null'
+ default: 0.0
+ title: Presence Penalty
+ prompt:
+ anyOf:
+ - items:
+ type: integer
+ type: array
+ - items:
+ items:
+ type: integer
+ type: array
+ type: array
+ - type: string
+ - items:
+ type: string
+ type: array
+ title: Prompt
+ prompt_logprobs:
+ anyOf:
+ - type: integer
+ - type: 'null'
+ title: Prompt Logprobs
+ repetition_penalty:
+ default: 1.0
+ title: Repetition Penalty
+ type: number
+ response_format:
+ anyOf:
+ - $ref: '#/components/schemas/ResponseFormat'
+ - type: 'null'
+ description: 'Similar to chat completion, this parameter specifies the format
+ of output. Only {''type'': ''json_object''} or {''type'': ''text'' } is
+ supported.'
+ seed:
+ anyOf:
+ - maximum: 9.223372036854776e+18
+ minimum: -9.223372036854776e+18
+ type: integer
+ - type: 'null'
+ title: Seed
+ skip_special_tokens:
+ default: true
+ title: Skip Special Tokens
+ type: boolean
+ spaces_between_special_tokens:
+ default: true
+ title: Spaces Between Special Tokens
+ type: boolean
+ stop:
+ anyOf:
+ - type: string
+ - items:
+ type: string
+ type: array
+ - type: 'null'
+ title: Stop
+ stop_token_ids:
+ anyOf:
+ - items:
+ type: integer
+ type: array
+ - type: 'null'
+ title: Stop Token Ids
+ stream:
+ anyOf:
+ - type: boolean
+ - type: 'null'
+ default: false
+ title: Stream
+ stream_options:
+ anyOf:
+ - $ref: '#/components/schemas/StreamOptions'
+ - type: 'null'
+ suffix:
+ anyOf:
+ - type: string
+ - type: 'null'
+ title: Suffix
+ temperature:
+ anyOf:
+ - type: number
+ - type: 'null'
+ default: 1.0
+ title: Temperature
+ top_k:
+ default: -1
+ title: Top K
+ type: integer
+ top_p:
+ anyOf:
+ - type: number
+ - type: 'null'
+ default: 1.0
+ title: Top P
+ truncate_prompt_tokens:
+ anyOf:
+ - minimum: 1.0
+ type: integer
+ - type: 'null'
+ title: Truncate Prompt Tokens
+ use_beam_search:
+ default: false
+ title: Use Beam Search
+ type: boolean
+ user:
+ anyOf:
+ - type: string
+ - type: 'null'
+ title: User
+ required:
+ - model
+ - prompt
+ title: CompletionRequest
+ type: object
+ CustomChatCompletionContentPartParam:
+ additionalProperties: true
+ properties:
+ type:
+ title: Type
+ type: string
+ required:
+ - type
+ title: CustomChatCompletionContentPartParam
+ type: object
+ CustomChatCompletionMessageParam:
+ description: Enables custom roles in the Chat Completion API.
+ properties:
+ content:
+ anyOf:
+ - type: string
+ - items:
+ anyOf:
+ - $ref: '#/components/schemas/ChatCompletionContentPartTextParam'
+ - $ref: '#/components/schemas/ChatCompletionContentPartImageParam'
+ - $ref: '#/components/schemas/ChatCompletionContentPartAudioParam'
+ - $ref: '#/components/schemas/ChatCompletionContentPartRefusalParam'
+ - $ref: '#/components/schemas/CustomChatCompletionContentPartParam'
+ type: array
+ title: Content
+ name:
+ title: Name
+ type: string
+ role:
+ title: Role
+ type: string
+ tool_call_id:
+ anyOf:
+ - type: string
+ - type: 'null'
+ title: Tool Call Id
+ tool_calls:
+ anyOf:
+ - items:
+ $ref: '#/components/schemas/ChatCompletionMessageToolCallParam'
+ type: array
+ - type: 'null'
+ title: Tool Calls
+ required:
+ - role
+ title: CustomChatCompletionMessageParam
+ type: object
+ Function:
+ properties:
+ arguments:
+ title: Arguments
+ type: string
+ name:
+ title: Name
+ type: string
+ required:
+ - arguments
+ - name
+ title: Function
+ type: object
+ FunctionCall:
+ properties:
+ arguments:
+ title: Arguments
+ type: string
+ name:
+ title: Name
+ type: string
+ required:
+ - arguments
+ - name
+ title: FunctionCall
+ type: object
+ FunctionDefinition:
+ additionalProperties: false
+ properties:
+ description:
+ anyOf:
+ - type: string
+ - type: 'null'
+ title: Description
+ name:
+ title: Name
+ type: string
+ parameters:
+ anyOf:
+ - type: object
+ - type: 'null'
+ title: Parameters
+ required:
+ - name
+ title: FunctionDefinition
+ type: object
+ HTTPValidationError:
+ properties:
+ detail:
+ items:
+ $ref: '#/components/schemas/ValidationError'
+ title: Detail
+ type: array
+ title: HTTPValidationError
+ type: object
+ ImageURL:
+ properties:
+ detail:
+ enum:
+ - auto
+ - low
+ - high
+ title: Detail
+ type: string
+ url:
+ title: Url
+ type: string
+ required:
+ - url
+ title: ImageURL
+ type: object
+ InternalServerError:
+ description: Internal Server Error
+ properties:
+ detail:
+ title: Error Detail
+ type: string
+ error:
+ title: Message
+ type: string
+ required:
+ - error
+ title: InternalServerError
+ type: object
+ InvalidArgument:
+ description: Bad Request
+ properties:
+ detail:
+ title: Error Detail
+ type: string
+ error:
+ title: Message
+ type: string
+ required:
+ - error
+ title: InvalidArgument
+ type: object
+ JsonSchemaResponseFormat:
+ additionalProperties: false
+ properties:
+ description:
+ anyOf:
+ - type: string
+ - type: 'null'
+ title: Description
+ name:
+ title: Name
+ type: string
+ schema:
+ anyOf:
+ - type: object
+ - type: 'null'
+ title: Schema
+ strict:
+ anyOf:
+ - type: boolean
+ - type: 'null'
+ title: Strict
+ required:
+ - name
+ title: JsonSchemaResponseFormat
+ type: object
+ NotFound:
+ description: Not Found
+ properties:
+ detail:
+ title: Error Detail
+ type: string
+ error:
+ title: Message
+ type: string
+ required:
+ - error
+ title: NotFound
+ type: object
+ ResponseFormat:
+ additionalProperties: false
+ properties:
+ json_schema:
+ anyOf:
+ - $ref: '#/components/schemas/JsonSchemaResponseFormat'
+ - type: 'null'
+ type:
+ enum:
+ - text
+ - json_object
+ - json_schema
+ title: Type
+ type: string
+ required:
+ - type
+ title: ResponseFormat
+ type: object
+ StreamOptions:
+ additionalProperties: false
+ properties:
+ continuous_usage_stats:
+ anyOf:
+ - type: boolean
+ - type: 'null'
+ default: true
+ title: Continuous Usage Stats
+ include_usage:
+ anyOf:
+ - type: boolean
+ - type: 'null'
+ default: true
+ title: Include Usage
+ title: StreamOptions
+ type: object
+ TaskStatusResponse:
+ properties:
+ created_at:
+ title: Created At
+ type: string
+ executed_at:
+ anyOf:
+ - type: string
+ - type: 'null'
+ title: Executed At
+ status:
+ enum:
+ - in_progress
+ - success
+ - failure
+ - cancelled
+ title: Status
+ type: string
+ task_id:
+ title: Task Id
+ type: string
+ required:
+ - task_id
+ - status
+ - created_at
+ - executed_at
+ title: TaskStatusResponse
+ type: object
+ ValidationError:
+ properties:
+ loc:
+ items:
+ anyOf:
+ - type: string
+ - type: integer
+ title: Location
+ type: array
+ msg:
+ title: Message
+ type: string
+ type:
+ title: Error Type
+ type: string
+ required:
+ - loc
+ - msg
+ - type
+ title: ValidationError
+ type: object
+ generate__Input:
+ properties:
+ prompt:
+ default: what is this?
+ title: Prompt
+ type: string
+ title: generate__Input
+ type: object
+ generate_with_image__Input:
+ properties:
+ image:
+ anyOf:
+ - format: image
+ type: file
+ - type: 'null'
+ default: null
+ title: Image
+ prompt:
+ default: what is this?
+ title: Prompt
+ type: string
+ title: generate_with_image__Input
+ type: object
+info:
+ contact:
+ email: contact@bentoml.com
+ name: BentoML Team
+ description: "# llama3_2:dev\n\n[![pypi_status](https://img.shields.io/badge/BentoML-1.3.6-informational)](https://pypi.org/project/BentoML)\n\
+ [![documentation_status](https://readthedocs.org/projects/bentoml/badge/?version=latest)](https://docs.bentoml.com/)\n\
+ [![join_slack](https://badgen.net/badge/Join/BentoML%20Slack/cyan?icon=slack)](https://l.bentoml.com/join-slack-swagger)\n\
+ [![BentoML GitHub Repo](https://img.shields.io/github/stars/bentoml/bentoml?style=social)](https://github.com/bentoml/BentoML)\n\
+ [![Twitter Follow](https://img.shields.io/twitter/follow/bentomlai?label=Follow%20BentoML&style=social)](https://twitter.com/bentomlai)\n\
+ \nThis is a Machine Learning Service created with BentoML.\n\n## Help\n\n* [\U0001F4D6\
+ \ Documentation](https://docs.bentoml.com/en/latest/): Learn how to use BentoML.\n\
+ * [\U0001F4AC Community](https://l.bentoml.com/join-slack-swagger): Join the BentoML\
+ \ Slack community.\n* [\U0001F41B GitHub Issues](https://github.com/bentoml/BentoML/issues):\
+ \ Report bugs and feature requests.\n* Tip: you can also [customize this README](https://docs.bentoml.com/en/latest/concepts/bento.html#description).\n"
+ title: llama3_2
+ version: None
+openapi: 3.0.2
+paths:
+ /chat/:
+ get:
+ operationId: serve_chat_html__get
+ responses:
+ '200':
+ content:
+ application/json:
+ schema: {}
+ description: Successful Response
+ summary: Serve Chat Html
+ /chat/{full_path}:
+ get:
+ operationId: catch_all__full_path__get
+ parameters:
+ - in: path
+ name: full_path
+ required: true
+ schema:
+ title: Full Path
+ type: string
+ responses:
+ '200':
+ content:
+ application/json:
+ schema: {}
+ description: Successful Response
+ '422':
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/HTTPValidationError'
+ description: Validation Error
+ summary: Catch All
+ /generate:
+ post:
+ description: ''
+ operationId: llama3_2__generate
+ requestBody:
+ content:
+ application/json:
+ schema:
+ properties:
+ prompt:
+ default: what is this?
+ title: Prompt
+ type: string
+ title: Input
+ type: object
+ responses:
+ 200:
+ content:
+ text/event-stream:
+ schema:
+ title: IORootModel[str]
+ type: string
+ description: Successful Response
+ 400:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/InvalidArgument'
+ description: Bad Request
+ 404:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/NotFound'
+ description: Not Found
+ 500:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/InternalServerError'
+ description: Internal Server Error
+ tags:
+ - Service APIs
+ x-bentoml-name: generate
+ /generate_with_image:
+ post:
+ description: ''
+ operationId: llama3_2__generate_with_image
+ requestBody:
+ content:
+ multipart/form-data:
+ schema:
+ properties:
+ image:
+ default: null
+ format: binary
+ title: Image
+ type: string
+ prompt:
+ default: what is this?
+ title: Prompt
+ type: string
+ title: Input
+ type: object
+ responses:
+ 200:
+ content:
+ text/event-stream:
+ schema:
+ title: IORootModel[str]
+ type: string
+ description: Successful Response
+ 400:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/InvalidArgument'
+ description: Bad Request
+ 404:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/NotFound'
+ description: Not Found
+ 500:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/InternalServerError'
+ description: Internal Server Error
+ tags:
+ - Service APIs
+ x-bentoml-name: generate_with_image
+ /healthz:
+ get:
+      description: Health check endpoint. Expecting an empty response with status
+        code 200 when the service is in health state. The /healthz endpoint is
+        deprecated. (since Kubernetes v1.16)
+ responses:
+ '200':
+ description: Successful Response
+ tags:
+ - Infrastructure
+ /livez:
+ get:
+      description: Health check endpoint for Kubernetes. A healthy endpoint responds
+        with a 200 OK status.
+ responses:
+ '200':
+ description: Successful Response
+ tags:
+ - Infrastructure
+ /metrics:
+ get:
+      description: Prometheus metrics endpoint. The /metrics endpoint responds
+        with a 200. The output can then be used by a Prometheus sidecar to scrape
+        the metrics of the service.
+ responses:
+ '200':
+ description: Successful Response
+ tags:
+ - Infrastructure
+ /readyz:
+ get:
+      description: A 200 OK status from the /readyz endpoint indicates the service
+        is ready to accept traffic. From that point onward, Kubernetes will use
+        the /livez endpoint to perform periodic health checks.
+ responses:
+ '200':
+ description: Successful Response
+ tags:
+ - Infrastructure
+ /v1/chat/completions:
+ post:
+ operationId: create_chat_completion_chat_completions_post
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ChatCompletionRequest'
+ required: true
+ responses:
+ '200':
+ content:
+ application/json:
+ schema: {}
+ description: Successful Response
+ '422':
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/HTTPValidationError'
+ description: Validation Error
+ summary: Create Chat Completion
+ /v1/completions:
+ post:
+ operationId: create_completion_completions_post
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/CompletionRequest'
+ required: true
+ responses:
+ '200':
+ content:
+ application/json:
+ schema: {}
+ description: Successful Response
+ '422':
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/HTTPValidationError'
+ description: Validation Error
+ summary: Create Completion
+ /v1/models:
+ get:
+ operationId: show_available_models_models_get
+ responses:
+ '200':
+ content:
+ application/json:
+ schema: {}
+ description: Successful Response
+ summary: Show Available Models
+servers:
+- url: .
+tags:
+- description: BentoML Service API endpoints for inference.
+ name: Service APIs
+- description: Common infrastructure endpoints for observability.
+ name: Infrastructure
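
The `/v1/chat/completions` route above is OpenAI-compatible, so the stock `openai` client works against it. A sketch, assuming a local server on port 3000; the image URL is a placeholder, and any `api_key` value passes since the server does not validate it (see `api_key="dummy"` in `service.py` below):

```python
# Streaming vision chat against the OpenAI-compatible route; base_url and the
# image URL are placeholder assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:3000/v1", api_key="dummy")
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
            {"type": "text", "text": "What is in this image?"},
        ],
    }],
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
```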
diff --git a/bentoml/bentos/llama3_2/11b-vision/apis/schema.json b/bentoml/bentos/llama3_2/11b-vision/apis/schema.json
new file mode 100644
index 00000000..5f089377
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/apis/schema.json
@@ -0,0 +1,58 @@
+{
+ "name": "llama3_2",
+ "type": "service",
+ "routes": [
+ {
+ "name": "generate",
+ "route": "/generate",
+ "batchable": false,
+ "input": {
+ "properties": {
+ "prompt": {
+ "default": "what is this?",
+ "title": "Prompt",
+ "type": "string"
+ }
+ },
+ "title": "Input",
+ "type": "object"
+ },
+ "output": {
+ "title": "IORootModel[str]",
+ "type": "string",
+ "is_stream": true,
+ "media_type": "text/event-stream"
+ },
+ "is_task": false
+ },
+ {
+ "name": "generate_with_image",
+ "route": "/generate_with_image",
+ "batchable": false,
+ "input": {
+ "properties": {
+ "prompt": {
+ "default": "what is this?",
+ "title": "Prompt",
+ "type": "string"
+ },
+ "image": {
+ "default": null,
+ "title": "Image",
+ "format": "image",
+ "type": "file"
+ }
+ },
+ "title": "Input",
+ "type": "object"
+ },
+ "output": {
+ "title": "IORootModel[str]",
+ "type": "string",
+ "is_stream": true,
+ "media_type": "text/event-stream"
+ },
+ "is_task": false
+ }
+ ]
+}
\ No newline at end of file
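
Per this schema, `/generate_with_image` takes multipart form data and streams its string output as server-sent events. A raw-HTTP sketch with `requests`, assuming a local server; `photo.png` is a placeholder path:

```python
# Multipart upload plus streamed response; the URL and file path are assumptions.
import requests

with open("photo.png", "rb") as f:
    resp = requests.post(
        "http://localhost:3000/generate_with_image",
        data={"prompt": "what is this?"},
        files={"image": ("photo.png", f, "image/png")},
        stream=True,
    )
resp.raise_for_status()
# Print the streamed chunks line by line as they arrive.
for line in resp.iter_lines(decode_unicode=True):
    if line:
        print(line)
```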
diff --git a/bentoml/bentos/llama3_2/11b-vision/bento.yaml b/bentoml/bentos/llama3_2/11b-vision/bento.yaml
new file mode 100644
index 00000000..859ae9b2
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/bento.yaml
@@ -0,0 +1,94 @@
+service: service:VLLM
+name: llama3_2
+version: 11b-vision
+bentoml_version: 1.3.6
+creation_time: '2024-09-26T08:21:20.985912+00:00'
+labels:
+ source: https://github.com/bentoml/openllm-models/tree/main/src/vllm-chat
+ platforms: linux
+models: []
+runners: []
+entry_service: llama3_2
+services:
+- name: llama3_2
+ service: ''
+ models: []
+ dependencies: []
+ config:
+ name: llama3_2
+ traffic:
+ timeout: 300
+ resources:
+ gpu: 1
+ gpu_type: nvidia-a100-80gb
+envs: []
+schema:
+ name: llama3_2
+ type: service
+ routes:
+ - name: generate
+ route: /generate
+ batchable: false
+ input:
+ properties:
+ prompt:
+ default: what is this?
+ title: Prompt
+ type: string
+ title: Input
+ type: object
+ output:
+ title: IORootModel[str]
+ type: string
+ is_stream: true
+ media_type: text/event-stream
+ is_task: false
+ - name: generate_with_image
+ route: /generate_with_image
+ batchable: false
+ input:
+ properties:
+ prompt:
+ default: what is this?
+ title: Prompt
+ type: string
+ image:
+ default: null
+ title: Image
+ format: image
+ type: file
+ title: Input
+ type: object
+ output:
+ title: IORootModel[str]
+ type: string
+ is_stream: true
+ media_type: text/event-stream
+ is_task: false
+apis: []
+docker:
+ distro: debian
+ python_version: '3.9'
+ cuda_version: null
+ env: null
+ system_packages: null
+ setup_script: null
+ base_image: null
+ dockerfile_template: null
+python:
+ requirements_txt: ./requirements.txt
+ packages: null
+ lock_packages: true
+ pack_git_packages: true
+ index_url: null
+ no_index: null
+ trusted_host: null
+ find_links: null
+ extra_index_url: null
+ pip_args: null
+ wheels: null
+conda:
+ environment_yml: null
+ channels: null
+ dependencies: null
+ pip: null
diff --git a/bentoml/bentos/llama3_2/11b-vision/env/docker/Dockerfile b/bentoml/bentos/llama3_2/11b-vision/env/docker/Dockerfile
new file mode 100644
index 00000000..6a314dec
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/env/docker/Dockerfile
@@ -0,0 +1,69 @@
+# ===========================================
+#
+# THIS IS A GENERATED DOCKERFILE. DO NOT EDIT
+#
+# ===========================================
+
+# Block SETUP_BENTO_BASE_IMAGE
+FROM python:3.9-slim as base-container
+
+ENV LANG=C.UTF-8
+
+ENV LC_ALL=C.UTF-8
+
+ENV PYTHONIOENCODING=UTF-8
+
+ENV PYTHONUNBUFFERED=1
+
+
+
+USER root
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
+RUN set -eux && \
+ apt-get update -y && \
+ apt-get install -q -y --no-install-recommends --allow-remove-essential \
+ ca-certificates gnupg2 bash build-essential curl
+ENV UV_SYSTEM_PYTHON=1
+RUN curl -LO https://astral.sh/uv/install.sh && \
+ sh install.sh && rm install.sh && mv $HOME/.cargo/bin/uv /usr/local/bin/
+
+# Block SETUP_BENTO_USER
+ARG BENTO_USER=bentoml
+ARG BENTO_USER_UID=1034
+ARG BENTO_USER_GID=1034
+RUN groupadd -g $BENTO_USER_GID -o $BENTO_USER && useradd -m -u $BENTO_USER_UID -g $BENTO_USER_GID -o -r $BENTO_USER
+ARG BENTO_PATH=/home/bentoml/bento
+ENV BENTO_PATH=$BENTO_PATH
+ENV BENTOML_HOME=/home/bentoml/
+ENV BENTOML_HF_CACHE_DIR=/home/bentoml/bento/hf-models
+
+RUN mkdir $BENTO_PATH && chown bentoml:bentoml $BENTO_PATH -R
+WORKDIR $BENTO_PATH
+
+
+# Block SETUP_BENTO_COMPONENTS
+
+# Pre-install the heaviest packages first; the trailing `; exit 0` makes these
+# best-effort so a failure here falls through to the install.sh step below.
+RUN uv pip install torch==2.4.0 ; exit 0
+RUN uv pip install vllm==0.6.2 ; exit 0
+COPY --chown=bentoml:bentoml ./env/python ./env/python/
+# install python packages with install.sh
+RUN bash -euxo pipefail /home/bentoml/bento/env/python/install.sh
+COPY --chown=bentoml:bentoml . ./
+
+# Block SETUP_BENTO_ENTRYPOINT
+RUN rm -rf /var/lib/apt /var/lib/cache /var/lib/log
+# Default port for BentoServer
+EXPOSE 3000
+
+# Expose Prometheus port
+EXPOSE 3001
+
+RUN chmod +x /home/bentoml/bento/env/docker/entrypoint.sh
+
+USER bentoml
+
+ENTRYPOINT [ "/home/bentoml/bento/env/docker/entrypoint.sh" ]
+
+
diff --git a/bentoml/bentos/llama3_2/11b-vision/env/docker/entrypoint.sh b/bentoml/bentos/llama3_2/11b-vision/env/docker/entrypoint.sh
new file mode 100644
index 00000000..df1892dd
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/env/docker/entrypoint.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+set -Eeuo pipefail
+
+# check to see if this file is being run or sourced from another script
+_is_sourced() {
+ # https://unix.stackexchange.com/a/215279
+ [ "${#FUNCNAME[@]}" -ge 2 ] &&
+ [ "${FUNCNAME[0]}" = '_is_sourced' ] &&
+ [ "${FUNCNAME[1]}" = 'source' ]
+}
+
+_main() {
+    # For backwards compatibility with yatai<1.0.0, adapting the old "yatai" command to the new "start" command.
+ if [ "${#}" -gt 0 ] && [ "${1}" = 'python' ] && [ "${2}" = '-m' ] && { [ "${3}" = 'bentoml._internal.server.cli.runner' ] || [ "${3}" = "bentoml._internal.server.cli.api_server" ]; }; then # SC2235, use { } to avoid subshell overhead
+ if [ "${3}" = 'bentoml._internal.server.cli.runner' ]; then
+ set -- bentoml start-runner-server "${@:4}"
+ elif [ "${3}" = 'bentoml._internal.server.cli.api_server' ]; then
+ set -- bentoml start-http-server "${@:4}"
+ fi
+ # If no arg or first arg looks like a flag.
+ elif [[ "$#" -eq 0 ]] || [[ "${1:0:1}" =~ '-' ]]; then
+        # This is provided for backwards compatibility with places where users may have
+        # discovered this easter egg and used it in their scripts to run the container.
+ if [[ -v BENTOML_SERVE_COMPONENT ]]; then
+ echo "\$BENTOML_SERVE_COMPONENT is set! Calling 'bentoml start-*' instead"
+ if [ "${BENTOML_SERVE_COMPONENT}" = 'http_server' ]; then
+ set -- bentoml start-http-server "$@" "$BENTO_PATH"
+ elif [ "${BENTOML_SERVE_COMPONENT}" = 'grpc_server' ]; then
+ set -- bentoml start-grpc-server "$@" "$BENTO_PATH"
+ elif [ "${BENTOML_SERVE_COMPONENT}" = 'runner' ]; then
+ set -- bentoml start-runner-server "$@" "$BENTO_PATH"
+ fi
+ else
+ set -- bentoml serve "$@" "$BENTO_PATH"
+ fi
+ fi
+    # Override BENTOML_PORT if the PORT env var is present. Used for Heroku and Yatai.
+    if [[ -v PORT ]]; then
+        echo "\$PORT is set! Overriding \$BENTOML_PORT with \$PORT ($PORT)"
+ export BENTOML_PORT=$PORT
+ fi
+    # Handle serve and start commands that are passed to the container.
+    # Assuming that serve and start commands are the first arguments.
+    # Note that this is the recommended way going forward to run all bentoml containers.
+ if [ "${#}" -gt 0 ] && { [ "${1}" = 'serve' ] || [ "${1}" = 'serve-http' ] || [ "${1}" = 'serve-grpc' ] || [ "${1}" = 'start-http-server' ] || [ "${1}" = 'start-grpc-server' ] || [ "${1}" = 'start-runner-server' ]; }; then
+ exec bentoml "$@" "$BENTO_PATH"
+ else
+        # Otherwise default to running whatever command was given.
+        # This should allow running bash, sh, python, etc.
+ exec "$@"
+ fi
+}
+
+if ! _is_sourced; then
+ _main "$@"
+fi
diff --git a/bentoml/bentos/llama3_2/11b-vision/env/python/install.sh b/bentoml/bentos/llama3_2/11b-vision/env/python/install.sh
new file mode 100644
index 00000000..e92ea93a
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/env/python/install.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+set -exuo pipefail
+
+# Parent directory https://stackoverflow.com/a/246128/8643197
+BASEDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}"; )" &> /dev/null && pwd 2> /dev/null; )"
+
+pip_install() {
+ if command -v "uv" > /dev/null 2>&1; then
+ uv pip install "$@"
+ else
+ pip3 install "$@"
+ fi
+}
+
+PIP_ARGS=()
+
+# BentoML by default generates two requirement files:
+#  - ./env/python/requirements.lock.txt: all dependencies locked to the versions present during `build`
+#  - ./env/python/requirements.txt: all dependencies as specified by the user in code or in a requirements.txt file
+REQUIREMENTS_TXT="$BASEDIR/requirements.txt"
+REQUIREMENTS_LOCK="$BASEDIR/requirements.lock.txt"
+WHEELS_DIR="$BASEDIR/wheels"
+BENTOML_VERSION=${BENTOML_VERSION:-1.3.6}
+# Install python packages, preferring the requirements.lock.txt file if it exists
+pushd "$BASEDIR" &>/dev/null
+if [ -f "$REQUIREMENTS_LOCK" ]; then
+ echo "Installing pip packages from 'requirements.lock.txt'.."
+ pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_LOCK"
+else
+ if [ -f "$REQUIREMENTS_TXT" ]; then
+ echo "Installing pip packages from 'requirements.txt'.."
+ pip_install "${PIP_ARGS[@]}" -r "$REQUIREMENTS_TXT"
+ fi
+fi
+popd &>/dev/null
+
+# Attempt to expand the glob pattern. The nullglob option ensures that
+# the pattern itself is not returned if no files match.
+shopt -s nullglob
+wheels=("$WHEELS_DIR"/*.whl)
+
+if [ ${#wheels[@]} -gt 0 ]; then
+ echo "Installing wheels packaged in Bento.."
+ pip_install "${PIP_ARGS[@]}" "${wheels[@]}"
+fi
+
+
+# Install BentoML from PyPI if it's not already installed
+if python3 -c "import bentoml" &> /dev/null; then
+ existing_bentoml_version=$(python3 -c "import bentoml; print(bentoml.__version__)")
+ if [ "$existing_bentoml_version" != "$BENTOML_VERSION" ]; then
+ echo "WARNING: using BentoML version ${existing_bentoml_version}"
+ fi
+else
+ pip_install bentoml=="$BENTOML_VERSION"
+fi
\ No newline at end of file
diff --git a/bentoml/bentos/llama3_2/11b-vision/env/python/requirements.lock.txt b/bentoml/bentos/llama3_2/11b-vision/env/python/requirements.lock.txt
new file mode 100644
index 00000000..482628a8
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/env/python/requirements.lock.txt
@@ -0,0 +1,154 @@
+--index-url https://pypi.org/simple
+
+aiohappyeyeballs==2.4.0
+aiohttp==3.10.6
+aiosignal==1.3.1
+aiosqlite==0.20.0
+annotated-types==0.7.0
+anyio==4.6.0
+appdirs==1.4.4
+asgiref==3.8.1
+async-timeout==4.0.3
+attrs==24.2.0
+bentoml==1.3.6
+cattrs==23.1.2
+certifi==2024.8.30
+charset-normalizer==3.3.2
+circus==0.18.0
+click==8.1.7
+click-option-group==0.5.6
+cloudpickle==3.0.0
+datasets==2.14.4
+deepmerge==2.0
+deprecated==1.2.14
+dill==0.3.7
+diskcache==5.6.3
+distro==1.9.0
+einops==0.8.0
+exceptiongroup==1.2.2
+fastapi==0.115.0
+filelock==3.16.1
+frozenlist==1.4.1
+fs==2.4.16
+fsspec==2024.9.0
+gguf==0.10.0
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.2
+httpx-ws==0.6.0
+huggingface-hub==0.25.1
+idna==3.10
+importlib-metadata==6.11.0
+inflection==0.5.1
+interegular==0.3.3
+jinja2==3.1.4
+jiter==0.5.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+lark==1.2.2
+llvmlite==0.43.0
+lm-format-enforcer==0.10.6
+markdown-it-py==3.0.0
+markupsafe==2.1.5
+mdurl==0.1.2
+mistral-common==1.4.3
+mpmath==1.3.0
+msgpack==1.1.0
+msgspec==0.18.6
+multidict==6.1.0
+multiprocess==0.70.15
+nest-asyncio==1.6.0
+networkx==3.2.1
+numba==0.60.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-ml-py==11.525.150
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.6.68
+nvidia-nvtx-cu12==12.1.105
+openai==1.48.0
+opentelemetry-api==1.20.0
+opentelemetry-instrumentation==0.41b0
+opentelemetry-instrumentation-aiohttp-client==0.41b0
+opentelemetry-instrumentation-asgi==0.41b0
+opentelemetry-sdk==1.20.0
+opentelemetry-semantic-conventions==0.41b0
+opentelemetry-util-http==0.41b0
+outlines==0.0.46
+packaging==24.1
+pandas==2.2.3
+partial-json-parser==0.2.1.1.post4
+pathspec==0.12.1
+pillow==10.4.0
+pip-requirements-parser==32.0.1
+prometheus-client==0.21.0
+prometheus-fastapi-instrumentator==7.0.0
+prompt-toolkit==3.0.36
+protobuf==5.28.2
+psutil==6.0.0
+py-cpuinfo==9.0.0
+pyairports==2.1.1
+pyarrow==17.0.0
+pycountry==24.6.1
+pydantic==2.9.2
+pydantic-core==2.23.4
+pygments==2.18.0
+pyparsing==3.1.4
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.10
+pytz==2024.2
+pyyaml==6.0.2
+pyzmq==26.2.0
+questionary==2.0.1
+ray==2.37.0
+referencing==0.35.1
+regex==2024.9.11
+requests==2.32.3
+rich==13.8.1
+rpds-py==0.20.0
+safetensors==0.4.5
+schema==0.7.7
+sentencepiece==0.2.0
+setuptools==75.1.0
+simple-di==0.1.5
+six==1.16.0
+sniffio==1.3.1
+starlette==0.38.6
+sympy==1.13.3
+tiktoken==0.7.0
+tokenizers==0.20.0
+tomli==2.0.1
+tomli-w==1.0.0
+torch==2.4.0
+torchvision==0.19.0
+tornado==6.4.1
+tqdm==4.66.5
+transformers==4.45.0
+triton==3.0.0
+typing-extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uv==0.4.16
+uvicorn==0.30.6
+uvloop==0.20.0
+vllm==0.6.2
+watchfiles==0.24.0
+wcwidth==0.2.13
+websockets==13.1
+wrapt==1.16.0
+wsproto==1.2.0
+xformers==0.0.27.post2
+xxhash==3.5.0
+yarl==1.12.1
+zipp==3.20.2
diff --git a/bentoml/bentos/llama3_2/11b-vision/env/python/requirements.txt b/bentoml/bentos/llama3_2/11b-vision/env/python/requirements.txt
new file mode 100644
index 00000000..397c6499
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/env/python/requirements.txt
@@ -0,0 +1,6 @@
+vllm==0.6.2
+pyyaml
+# mistral_common[opencv]
+pillow
+openai
+bentoml==1.3.6
diff --git a/bentoml/bentos/llama3_2/11b-vision/env/python/version.txt b/bentoml/bentos/llama3_2/11b-vision/env/python/version.txt
new file mode 100644
index 00000000..42a3d342
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/env/python/version.txt
@@ -0,0 +1 @@
+3.9.19
\ No newline at end of file
diff --git a/bentoml/bentos/llama3_2/11b-vision/src/bentofile.yaml b/bentoml/bentos/llama3_2/11b-vision/src/bentofile.yaml
new file mode 100644
index 00000000..64505106
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/src/bentofile.yaml
@@ -0,0 +1,14 @@
+service: "service:VLLM"
+labels:
+ source: https://github.com/bentoml/openllm-models/tree/main/src/vllm-chat
+ platforms: linux
+include:
+- "*.py"
+- "*.yaml"
+- "ui/*"
+- "ui/chunks/*"
+- "ui/css/*"
+- "ui/media/*"
+- "ui/chunks/pages/*"
+python:
+ requirements_txt: "./requirements.txt"
diff --git a/bentoml/bentos/llama3_2/11b-vision/src/openllm_config.yaml b/bentoml/bentos/llama3_2/11b-vision/src/openllm_config.yaml
new file mode 100644
index 00000000..31ccddf1
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/src/openllm_config.yaml
@@ -0,0 +1,19 @@
+alias:
+ - 11b-vision
+project: vllm-chat
+bentoml:
+ service_args:
+ name: llama3_2
+ traffic:
+ timeout: 300
+ resources:
+ gpu: 1
+ gpu_type: nvidia-a100-80gb
+vllm:
+ engine_args:
+ model: meta-llama/Llama-3.2-11B-Vision-Instruct
+ enforce_eager: true
+ limit_mm_per_prompt:
+ image: 1
+ max_model_len: 16384
+ max_num_seqs: 16
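
The `engine_args` mapping above is unpacked verbatim into vLLM's `AsyncEngineArgs` by `service.py` below; an equivalent explicit construction, shown only to spell out what each field controls:

```python
# Equivalent explicit form of the engine_args above (vllm 0.6.2).
from vllm import AsyncEngineArgs

engine_args = AsyncEngineArgs(
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    enforce_eager=True,                # skip CUDA graph capture to save memory
    limit_mm_per_prompt={"image": 1},  # accept at most one image per prompt
    max_model_len=16384,               # cap the context window at 16k tokens
    max_num_seqs=16,                   # limit concurrent sequences per batch
)
```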
diff --git a/bentoml/bentos/llama3_2/11b-vision/src/service.py b/bentoml/bentos/llama3_2/11b-vision/src/service.py
new file mode 100644
index 00000000..9c1c10fb
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/src/service.py
@@ -0,0 +1,185 @@
+import logging
+import traceback
+import base64
+import io
+import os
+from argparse import Namespace
+from typing import Literal, Optional, AsyncGenerator
+
+import bentoml
+import fastapi
+import fastapi.staticfiles
+import pydantic
+import vllm.entrypoints.openai.api_server as vllm_api_server
+import yaml
+from fastapi.responses import FileResponse
+import PIL.Image
+
+
+class URL(pydantic.BaseModel):
+ url: str
+
+
+class Content(pydantic.BaseModel):
+ type: Literal["text", "image_url"] = "text"
+ text: Optional[str] = None
+ image_url: Optional[URL] = None
+
+
+class Message(pydantic.BaseModel):
+ role: Literal["system", "user", "assistant"] = "user"
+ content: list[Content]
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+# Load the constants from the yaml file
+PARAMETER_YAML = os.path.join(os.path.dirname(__file__), "openllm_config.yaml")
+with open(PARAMETER_YAML) as f:
+ PARAMETERS = yaml.safe_load(f)
+
+ENGINE_CONFIG = PARAMETERS.get("vllm", {}).get("engine_args", {})
+SERVICE_CONFIG = PARAMETERS.get("bentoml", {}).get("service_args", {})
+
+
+# openai api app
+openai_api_app = fastapi.FastAPI()
+OPENAI_ENDPOINTS = [
+ ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]],
+ ["/completions", vllm_api_server.create_completion, ["POST"]],
+ ["/models", vllm_api_server.show_available_models, ["GET"]],
+]
+for route, endpoint, methods in OPENAI_ENDPOINTS:
+ openai_api_app.add_api_route(
+ path=route,
+ endpoint=endpoint,
+ methods=methods,
+ include_in_schema=True,
+ )
+
+
+# chat UI app
+ui_app = fastapi.FastAPI()
+STATIC_DIR = os.path.join(os.path.dirname(__file__), "ui")
+ui_app.mount(
+ "/static", fastapi.staticfiles.StaticFiles(directory=STATIC_DIR), name="static"
+)
+
+
+@ui_app.get("/")
+async def serve_chat_html():
+ return FileResponse(os.path.join(STATIC_DIR, "chat.html"))
+
+
+@ui_app.get("/{full_path:path}")
+async def catch_all(full_path: str):
+ file_path = os.path.join(STATIC_DIR, full_path)
+ if os.path.exists(file_path):
+ return FileResponse(file_path)
+ return FileResponse(os.path.join(STATIC_DIR, "chat.html"))
+
+
+@bentoml.mount_asgi_app(openai_api_app, path="/v1")
+@bentoml.mount_asgi_app(ui_app, path="/chat")
+@bentoml.service(**SERVICE_CONFIG)
+class VLLM:
+ def __init__(self) -> None:
+ from vllm import AsyncEngineArgs, AsyncLLMEngine
+ from vllm.entrypoints.openai.api_server import init_app_state
+
+ ENGINE_ARGS = AsyncEngineArgs(**ENGINE_CONFIG)
+ self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS)
+ logger.info(f"VLLM service initialized with model: {ENGINE_CONFIG['model']}")
+
+ model_config = self.engine.engine.get_model_config()
+
+ args = Namespace()
+ args.model = ENGINE_CONFIG["model"]
+ args.disable_log_requests = True
+ args.max_log_len = 1000
+ args.response_role = "assistant"
+ args.served_model_name = None
+ args.chat_template = None
+ args.lora_modules = None
+ args.prompt_adapters = None
+ args.request_logger = None
+ args.disable_log_stats = True
+ args.return_tokens_as_token_ids = False
+ args.enable_tool_call_parser = False
+ args.enable_auto_tool_choice = False
+ args.tool_call_parser = None
+
+ init_app_state(self.engine, model_config, openai_api_app.state, args)
+
+ @bentoml.api
+ async def generate(self, prompt: str = "what is this?") -> AsyncGenerator[str, None]:
+ from openai import AsyncOpenAI
+
+ client = AsyncOpenAI(
+ base_url="http://127.0.0.1:3000/v1",
+ api_key="dummy",
+ )
+ content = [
+ Content(
+ type="text",
+ text=prompt,
+ )
+ ]
+ message = Message(role="user", content=content)
+
+ try:
+ completion = await client.chat.completions.create(
+ model=ENGINE_CONFIG["model"],
+ messages=[message.model_dump()],
+ stream=True,
+ )
+ async for chunk in completion:
+ yield chunk.choices[0].delta.content or ""
+ except Exception:
+ yield traceback.format_exc()
+
+ @bentoml.api
+ async def generate_with_image(self, prompt: str = "what is this?", image: Optional[PIL.Image.Image] = None) -> AsyncGenerator[str, None]:
+ from openai import AsyncOpenAI
+
+ client = AsyncOpenAI(
+ base_url="http://127.0.0.1:3000/v1",
+ api_key="dummy",
+ )
+ if image:
+ buffered = io.BytesIO()
+ image.save(buffered, format="PNG")
+ img_str = base64.b64encode(buffered.getvalue()).decode()
+ buffered.close()
+ image_url = f"data:image/png;base64,{img_str}"
+ content = [
+ Content(
+ type="image_url",
+ image_url=URL(url=image_url),
+ ),
+ Content(
+ type="text",
+ text=prompt,
+ )
+ ]
+ else:
+ content = [
+ Content(
+ type="text",
+ text=prompt,
+ )
+ ]
+ message = Message(role="user", content=content)
+
+ try:
+ completion = await client.chat.completions.create(
+ model=ENGINE_CONFIG["model"],
+ messages=[message.model_dump()],
+ stream=True,
+ )
+ async for chunk in completion:
+ yield chunk.choices[0].delta.content or ""
+ except Exception:
+ yield traceback.format_exc()
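
Both endpoints proxy requests through the mounted OpenAI app and stream chunks back to the caller. A client-side sketch for the image endpoint, assuming a local server; `photo.png` is a placeholder path, and passing a `pathlib.Path` for the file-typed `image` parameter is an assumption about the BentoML client:

```python
# Client sketch for generate_with_image; the URL and file path are assumptions.
from pathlib import Path

import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    # The image is uploaded as multipart form data; chunks stream back over SSE.
    for chunk in client.generate_with_image(prompt="what is this?", image=Path("photo.png")):
        print(chunk, end="", flush=True)
```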
diff --git a/bentoml/bentos/llama3_2/11b-vision/src/ui/404.html b/bentoml/bentos/llama3_2/11b-vision/src/ui/404.html
new file mode 100644
index 00000000..dbc065d4
--- /dev/null
+++ b/bentoml/bentos/llama3_2/11b-vision/src/ui/404.html
@@ -0,0 +1,5 @@
+
e||125