diff --git a/src/llamacpp-chat/openllm_config.yaml b/src/llamacpp-chat/openllm_config.yaml
index f18bf631..908e7ffb 100644
--- a/src/llamacpp-chat/openllm_config.yaml
+++ b/src/llamacpp-chat/openllm_config.yaml
@@ -1,11 +1,35 @@
-project: llamacpp-chat
-service_config:
-  name: phi3
-  traffic:
-    timeout: 300
-  resources:
-    memory: 3Gi
-engine_config:
-  model: microsoft/Phi-3-mini-4k-instruct-gguf
-  max_model_len: 2048
-chat_template: phi-3
+  project: llamacpp-chat
+  engine_config:
+    max_model_len: 2048
+    additional_files:
+      - qwen2.5-32b-instruct-fp16-00001-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00002-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00003-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00004-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00005-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00006-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00007-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00008-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00009-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00010-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00011-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00012-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00013-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00014-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00015-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00016-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00017-of-00017.gguf
+    filename: qwen2.5-32b-instruct-fp16-00001-of-00017.gguf
+    repo_id: Qwen/Qwen2.5-32B-Instruct-GGUF
+  extra_labels:
+    model_name: Qwen/Qwen2.5-72B-Instruct-GGUF
+    openllm_alias: 32b-ggml-fp16
+  service_config:
+    name: qwen2.5
+    resources:
+      memory: 60Gi
+    traffic:
+      timeout: 300
+  extra_envs:
+    - name: CMAKE_ARGS
+      value: "-DGGML_METAL=on"
diff --git a/src/llamacpp-chat/service.py b/src/llamacpp-chat/service.py
index 5059d2ca..cd5ee615 100644
--- a/src/llamacpp-chat/service.py
+++ b/src/llamacpp-chat/service.py
@@ -25,7 +25,6 @@
 ENGINE_CONFIG = CONSTANTS["engine_config"]
 SERVICE_CONFIG = CONSTANTS["service_config"]
-OVERRIDE_CHAT_TEMPLATE = CONSTANTS.get("chat_template")
 
 
 class Message(pydantic.BaseModel):
     role: Literal["system", "user", "assistant"]
@@ -43,7 +42,7 @@ async def show_available_models():
     return {
         "data":[
             {
-                "id": ENGINE_CONFIG["model"],
+                "id": ENGINE_CONFIG["repo_id"],
                 "object": "model",
                 "created": 1686935002,
                 "owned_by": "bentoml",
@@ -82,8 +81,7 @@ class LlamaCppChat:
     def __init__(self) -> None:
         from llama_cpp import Llama
 
         self.llm = Llama.from_pretrained(
-            repo_id=ENGINE_CONFIG["model"],
-            filename=ENGINE_CONFIG["filename"],
+            **ENGINE_CONFIG,
             verbose=False,
         )
@@ -93,7 +91,7 @@ async def chat_completions(
         messages: list[Message] = [
             {"role": "user", "content": "What is the meaning of life?"}
         ],
-        model: str = ENGINE_CONFIG["model"],
+        model: str = ENGINE_CONFIG["repo_id"],
         max_tokens: Annotated[
             int,
             Ge(128),
diff --git a/src/recipe.yaml b/src/recipe.yaml
index 0529f17c..6740bfee 100644
--- a/src/recipe.yaml
+++ b/src/recipe.yaml
@@ -767,7 +767,7 @@
     resources:
       memory: 3Gi
   engine_config:
-    model: microsoft/Phi-3-mini-4k-instruct-gguf
+    repo_id: microsoft/Phi-3-mini-4k-instruct-gguf
     max_model_len: 2048
     filename: "Phi-3-mini-4k-instruct-q4.gguf"
   extra_labels:
@@ -817,7 +817,7 @@
     model_name: mistral-community/pixtral-12b-240910
   extra_requirements:
     - mistral_common[opencv]
-'llama3.2:1b-instruct-fp16-ggml-darwin':
+'llama3.2:1b-instruct-ggml-fp16-darwin':
   project: llamacpp-chat
   service_config:
     name: llama3.2
@@ -827,17 +827,17 @@
       cpu: 1
       memory: 3Gi
   engine_config:
-    model: unsloth/Llama-3.2-1B-Instruct-GGUF
+    repo_id: unsloth/Llama-3.2-1B-Instruct-GGUF
     max_model_len: 2048
     filename: "Llama-3.2-1B-Instruct-F16.gguf"
   extra_labels:
-    openllm_alias: llama3.2,1b-instruct-fp16-ggml-darwin
+    openllm_alias: llama3.2,1b-instruct-ggml-fp16-darwin
     model_name: unsloth/Llama-3.2-1B-Instruct-GGUF
   platforms: macos
   extra_envs:
     - name: CMAKE_ARGS
       value: "-DGGML_METAL=on"
-'llama3.2:1b-instruct-fp16-ggml-linux':
+'llama3.2:1b-instruct-ggml-fp16-linux':
   project: llamacpp-chat
   service_config:
     name: llama3.2
@@ -847,13 +847,80 @@
       cpu: 1
       memory: 3Gi
   engine_config:
-    model: unsloth/Llama-3.2-1B-Instruct-GGUF
+    repo_id: unsloth/Llama-3.2-1B-Instruct-GGUF
     max_model_len: 2048
     filename: "Llama-3.2-1B-Instruct-F16.gguf"
   extra_labels:
-    openllm_alias: llama3.2,1b-instruct-fp16-ggml-linux
+    openllm_alias: llama3.2,1b-instruct-ggml-fp16-linux
     model_name: unsloth/Llama-3.2-1B-Instruct-GGUF
   platforms: linux
   extra_envs:
     - name: CMAKE_ARGS
-      value: "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
\ No newline at end of file
+      value: "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
+'qwen2.5:72b-instruct-ggml-q4-darwin':
+  project: llamacpp-chat
+  engine_config:
+    max_model_len: 2048
+    additional_files:
+      - qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00002-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00003-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00004-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00005-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00006-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00007-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00008-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00009-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00010-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00011-of-00012.gguf
+      - qwen2.5-72b-instruct-q4_k_m-00012-of-00012.gguf
+    filename: qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf
+    repo_id: Qwen/Qwen2.5-72B-Instruct-GGUF
+  extra_labels:
+    model_name: Qwen/Qwen2.5-72B-Instruct-GGUF
+    openllm_alias: 72b-ggml-q4
+  service_config:
+    name: qwen2.5
+    resources:
+      memory: 60Gi
+    traffic:
+      timeout: 300
+  extra_envs:
+    - name: CMAKE_ARGS
+      value: "-DGGML_METAL=on"
+'qwen2.5:32b-instruct-ggml-fp16-darwin':
+  project: llamacpp-chat
+  engine_config:
+    max_model_len: 2048
+    additional_files:
+      - qwen2.5-32b-instruct-fp16-00001-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00002-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00003-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00004-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00005-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00006-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00007-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00008-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00009-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00010-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00011-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00012-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00013-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00014-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00015-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00016-of-00017.gguf
+      - qwen2.5-32b-instruct-fp16-00017-of-00017.gguf
+    filename: qwen2.5-32b-instruct-fp16-00001-of-00017.gguf
+    repo_id: Qwen/Qwen2.5-32B-Instruct-GGUF
+  extra_labels:
+    model_name: Qwen/Qwen2.5-72B-Instruct-GGUF
+    openllm_alias: 32b-ggml-fp16
+  service_config:
+    name: qwen2.5
+    resources:
+      memory: 60Gi
+    traffic:
+      timeout: 300
+  extra_envs:
+    - name: CMAKE_ARGS
+      value: "-DGGML_METAL=on"
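Note (not part of the diff): after renaming model to repo_id and switching to Llama.from_pretrained(**ENGINE_CONFIG, ...), every key under engine_config is forwarded verbatim as a keyword argument. Below is a minimal sketch of that pattern, assuming a llama-cpp-python release whose Llama.from_pretrained accepts additional_files (needed for the split Qwen2.5 GGUF shards) and assuming the config path shown; keys it does not define, such as max_model_len, are assumed to be absorbed by Llama's keyword catch-all.

# Sketch only, mirroring the pattern introduced in service.py above; not the
# project's actual entry point.
import yaml
from llama_cpp import Llama

# Assumed path; in the repo this file sits next to service.py.
with open("src/llamacpp-chat/openllm_config.yaml") as f:
    CONSTANTS = yaml.safe_load(f)

ENGINE_CONFIG = CONSTANTS["engine_config"]

# repo_id, filename and additional_files are passed straight through, so the
# YAML keys must line up with from_pretrained's parameters; anything else
# (e.g. max_model_len) is assumed to be ignored via Llama's **kwargs.
llm = Llama.from_pretrained(**ENGINE_CONFIG, verbose=False)

print(llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the meaning of life?"}],
    max_tokens=128,
))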