diff --git a/06_gpu_and_ml/llm-serving/llama_cpp.py b/06_gpu_and_ml/llm-serving/llama_cpp.py
index b89bed7c3..d192021ee 100644
--- a/06_gpu_and_ml/llm-serving/llama_cpp.py
+++ b/06_gpu_and_ml/llm-serving/llama_cpp.py
@@ -7,7 +7,7 @@
 # It's lightweight, fast, and includes support for exotic quantizations like 5-bit integers.
 # This example shows how you can run `llama.cpp` on Modal.

-# We start by defining a container image with `llama.cpp` installed.
+# We start by defining a [container image](https://modal.com/docs/guide/custom-container) with `llama.cpp` installed.

 import modal

@@ -28,19 +28,42 @@
 # We use a model with 5-bit quantization.
 # The model format, `.gguf`, is a custom format used by `llama.cpp`.

-MODEL_NAME = "Meta-Llama-3.1-8B-Instruct"
+ORG_NAME = "bartowski"
+MODEL_NAME = "Meta-Llama-3.1-8B-Instruct-GGUF"
+REPO_ID = f"{ORG_NAME}/{MODEL_NAME}"
 MODEL_FILE = "Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf"
 REVISION = "9a8dec50f04fa8fad1dc1e7bc20a84a512e2bb01"

-image = image.run_commands(
-    f"curl --fail-with-body -L -O https://huggingface.co/bartowski/{MODEL_NAME}-GGUF/resolve/{REVISION}/{MODEL_FILE}?download=true"
+
+def download_model(repo_id, filename, revision):
+    from huggingface_hub import hf_hub_download
+
+    hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        revision=revision,
+        local_dir="/",
+    )
+
+
+# We can execute this Python function as part of building our image,
+# just as we can install dependencies and set environment variables,
+# with the `run_function` method:
+
+image = (
+    image.pip_install("huggingface_hub[hf_transfer]==0.26.2")
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
+    .run_function(download_model, args=(REPO_ID, MODEL_FILE, REVISION))
 )

-# Now, we're ready to define a serverless function that runs `llama.cpp`.
+
+# Now, we're ready to define a serverless function that runs `llama.cpp`!
+
 # We wrap that function with a decorator from a Modal App,
-# `@app.function` specifying the image it should run on
+# `@app.function`, specifying the image it should run on
 # and setting the maximum number of concurrent replicas
-# (here, `100`, which is the default).
+# (here, `100`, which is the default for CPU Functions).
+
 app = modal.App("llama-cpp-modal", image=image)

@@ -66,7 +89,8 @@ def llama_cpp_inference(
             str(num_output_tokens),
             "-p",
             prompt,
-        ]
+        ],
+        check=True,
     )
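
As context for the `check=True` change in the final hunk: `subprocess.run` otherwise returns a `CompletedProcess` even when the child exits non-zero, so a failed `llama.cpp` run would pass silently. Below is a minimal sketch of the pattern; the binary path, flags, and helper name are assumptions for illustration, not copied from the file.

import subprocess


def run_llama_cli(model_file: str, prompt: str, num_output_tokens: int = 128) -> str:
    # Illustrative sketch only: the llama-cli path and flags are assumed, not taken from the diff.
    # check=True raises subprocess.CalledProcessError on a non-zero exit code,
    # so a failed inference run surfaces as a Python exception instead of being ignored.
    result = subprocess.run(
        [
            "/llama.cpp/llama-cli",  # assumed install location inside the container
            "-m", f"/{model_file}",
            "-n", str(num_output_tokens),
            "-p", prompt,
        ],
        capture_output=True,
        check=True,
    )
    return result.stdout.decode()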