From d9cba4c86379b2bf2e3c74670dafb8d3b6697625 Mon Sep 17 00:00:00 2001
From: gongy
Date: Fri, 4 Aug 2023 14:58:57 +0000
Subject: [PATCH] Lint

---
 06_gpu_and_ml/vllm_hosted.py | 56 ++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 12 deletions(-)

diff --git a/06_gpu_and_ml/vllm_hosted.py b/06_gpu_and_ml/vllm_hosted.py
index 430fd0896..3dac9cbdf 100644
--- a/06_gpu_and_ml/vllm_hosted.py
+++ b/06_gpu_and_ml/vllm_hosted.py
@@ -2,15 +2,33 @@
 import os
 import json
 
-from modal import Stub, Mount, Image, Secret, Dict, asgi_app, web_endpoint, method, gpu
+from modal import (
+    Stub,
+    Mount,
+    Image,
+    Secret,
+    Dict,
+    asgi_app,
+    web_endpoint,
+    method,
+    gpu,
+)
 from pathlib import Path
 
 MODEL_DIR = "/model"
+
+
 def download_model_to_folder():
     from huggingface_hub import snapshot_download
 
-    snapshot_download("meta-llama/Llama-2-13b-chat-hf", local_dir=MODEL_DIR, local_dir_use_symlinks=False, token=os.environ["HUGGINGFACE_TOKEN"])
+    snapshot_download(
+        "meta-llama/Llama-2-13b-chat-hf",
+        local_dir=MODEL_DIR,
+        local_dir_use_symlinks=False,
+        token=os.environ["HUGGINGFACE_TOKEN"],
+    )
+
 
 vllm_image = (
     Image.from_dockerhub("nvcr.io/nvidia/pytorch:22.12-py3")
@@ -21,14 +39,23 @@ def download_model_to_folder():
     .pip_install(
         "vllm @ git+https://github.com/vllm-project/vllm.git@d7a1c6d614756b3072df3e8b52c0998035fb453f"
     )
-    .run_function(download_model_to_folder, secret=Secret.from_name("huggingface"))
+    .run_function(
+        download_model_to_folder, secret=Secret.from_name("huggingface")
+    )
 )
 
 stub = Stub("llama-demo")
 stub.dict = Dict.new()
 
+
 # vLLM class
-@stub.cls(gpu=gpu.A100(), image=vllm_image, allow_concurrent_inputs=60, concurrency_limit=1, container_idle_timeout=600)
+@stub.cls(
+    gpu=gpu.A100(),
+    image=vllm_image,
+    allow_concurrent_inputs=60,
+    concurrency_limit=1,
+    container_idle_timeout=600,
+)
 class Engine:
     def __enter__(self):
         from vllm.engine.arg_utils import AsyncEngineArgs
@@ -40,7 +67,7 @@ def __enter__(self):
         engine_args = AsyncEngineArgs(
             model=MODEL_DIR,
             # Only uses 90% of GPU memory by default
-            gpu_memory_utilization=0.95
+            gpu_memory_utilization=0.95,
         )
 
         self.engine = AsyncLLMEngine.from_engine_args(engine_args)
@@ -51,7 +78,7 @@ def __enter__(self):
 <</SYS>>
 
 {} [/INST] """
-
+
     def generated(self, n: int):
         # Log that n tokens have been generated
         t = time.time()
@@ -60,10 +87,10 @@ def generated(self, n: int):
         if t - self.last_report > 1.0:
             stub.app.dict.update(
                 tps=self.generated_tokens / (t - self.last_report),
-                t=self.last_report
+                t=self.last_report,
             )
             self.last_report, self.generated_tokens = t, 0
-
+
     @method()
     async def completion(self, question: str):
         if not question:
@@ -80,12 +107,14 @@ async def completion(self, question: str):
             max_tokens=1024,
         )
         request_id = random_uuid()
-        results_generator = self.engine.generate(self.template.format(question), sampling_params, request_id)
+        results_generator = self.engine.generate(
+            self.template.format(question), sampling_params, request_id
+        )
 
         t0 = time.time()
         index, tokens = 0, 0
         async for request_output in results_generator:
-            if '\ufffd' == request_output.outputs[0].text[-1]:
+            if "\ufffd" == request_output.outputs[0].text[-1]:
                 continue
             yield request_output.outputs[0].text[index:]
             index = len(request_output.outputs[0].text)
@@ -103,6 +132,7 @@ async def completion(self, question: str):
 
 # Front-end functionality
 frontend_path = Path(__file__).parent / "vllm-hosted"
+
 @stub.function(
     mounts=[Mount.from_local_dir(frontend_path, remote_path="/assets")],
     keep_warm=3,
@@ -142,9 +172,11 @@ async def get(question: str):
         # FastAPI will run this in a separate thread
         def generate():
             for chunk in Engine().completion.call(unquote(question)):
-                yield f'data: {json.dumps(dict(text=chunk), ensure_ascii=False)}\n\n'
+                yield f"data: {json.dumps(dict(text=chunk), ensure_ascii=False)}\n\n"
 
         return StreamingResponse(generate(), media_type="text/event-stream")
 
-    web_app.mount("/", fastapi.staticfiles.StaticFiles(directory="/assets", html=True))
+    web_app.mount(
+        "/", fastapi.staticfiles.StaticFiles(directory="/assets", html=True)
+    )
     return web_app