From 75b970b4f5c2729b6e05f655f29d5133a1c03a02 Mon Sep 17 00:00:00 2001
From: Charlie Ruan <53290280+CharlieFRuan@users.noreply.github.com>
Date: Thu, 13 Jun 2024 22:22:54 -0400
Subject: [PATCH] [Doc] Update WebLLM doc (#2578)

Update documentation for WebLLM. Currently we only provide a high-level view of
the WebLLM runtime here and refer users to the WebLLM repo README for more. The
documentation focuses on adding your own model variant / model library for
WebLLM. Will follow up with more thorough runtime documentation.
---
 docs/deploy/webllm.rst | 174 ++++++++++++++++++++++-------------------
 docs/install/emcc.rst  |  12 +++
 2 files changed, 104 insertions(+), 82 deletions(-)

diff --git a/docs/deploy/webllm.rst b/docs/deploy/webllm.rst
index 4fecf1723d..20cde05e51 100644
--- a/docs/deploy/webllm.rst
+++ b/docs/deploy/webllm.rst
@@ -7,70 +7,88 @@ WebLLM Javascript SDK
   :local:
   :depth: 2

-`WebLLM `_ is an MLC chat web runtime
-that allows you to build chat applications directly in the browser, leveraging
-`WebGPU `_ and providing users a natural layer of abstraction.
+`WebLLM `_ is a high-performance in-browser LLM
+inference engine that aims to be the backend of AI-powered web applications and agents.

-Try out the Prebuilt Webpage
----------------------------
+It provides a specialized runtime for the web backend of MLCEngine, leverages
+`WebGPU `_ for local acceleration, offers an OpenAI-compatible API,
+and provides built-in support for web workers to keep heavy computation off the UI thread.
+
+Please check out the `WebLLM repo `__ for how to use WebLLM to build
+web applications in JavaScript/TypeScript. Here we only provide a high-level overview and discuss
+how to use MLC-LLM to compile your own model to run with WebLLM.

-To get started, you can try out `WebLLM prebuilt webpage `__.
+Getting Started
+---------------

-A WebGPU-compatible browser and a local GPU are needed to run WebLLM.
+To get started, try out `WebLLM Chat `__, which provides a great example
+of integrating WebLLM into a full web application.
+
+A WebGPU-compatible browser is needed to run WebLLM-powered web applications.
You can download the latest Google Chrome and use `WebGPU Report `__
to verify the functionality of WebGPU on your browser.

+WebLLM is available as an `npm package `_ and is
+also delivered via CDN. Try a simple chatbot in
+`this JSFiddle example `__ without any setup.
+
+You can also check out the `existing examples `__
+for more advanced usage of WebLLM, such as JSON mode and streaming.

-Use WebLLM NPM Package
----------------------
-
-WebLLM is available as an `npm package `_.
-The source code is available in `the WebLLM repo `_,
-where you can make your own modifications and build from source.
-
-Note that the `WebLLM prebuilt webpage `__ above
-is powered by the WebLLM npm package, specifically with the code in
-the `simple-chat `__ example.
-
-Each of the model in the `WebLLM prebuilt webpage `__
-is registered as an instance of ``ModelRecord``. Looking at the most straightforward example
-`get-started `__,
-we see the code snippet:
+Model Records in WebLLM
+-----------------------
+
+Each of the models in `WebLLM Chat `__ is registered as an instance of
+``ModelRecord`` and can be accessed at
+`webllm.prebuiltAppConfig.model_list `__.
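+
+For instance, a quick way to see which prebuilt models are available is to read this list
+directly. Below is a minimal sketch; it assumes the ``@mlc-ai/web-llm`` npm package, so adapt
+the import to your own setup:
+
+.. code:: typescript
+
+   import * as webllm from "@mlc-ai/web-llm";
+
+   // Print the model_id of every prebuilt ModelRecord bundled with WebLLM.
+   const prebuiltIds = webllm.prebuiltAppConfig.model_list.map(
+     (record) => record.model_id,
+   );
+   console.log(prebuiltIds);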
+
+The most straightforward example, `get-started `__,
+shows that there are two ways to run a model.
+
+One can either use a prebuilt model by simply creating the engine with its ``model_id``:

 .. code:: typescript

-   const myAppConfig: AppConfig = {
+   const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+   const engine = await webllm.CreateMLCEngine(selectedModel);
+
+Or one can run their own model by creating a custom model record:
+
+.. code:: typescript
+
+   const appConfig: webllm.AppConfig = {
      model_list: [
        {
-         "model_url": "https://huggingface.co/mlc-ai/Llama-2-7b-chat-hf-q4f32_1-MLC/resolve/main/",
-         "local_id": "Llama-2-7b-chat-hf-q4f32_1",
-         "model_lib_url": "https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/Llama-2-7b-chat-hf/Llama-2-7b-chat-hf-q4f32_1-ctx4k_cs1k-webgpu.wasm",
-       },
-       {
-         "model_url": "https://huggingface.co/mlc-ai/Mistral-7B-Instruct-v0.2-q4f16_1-MLC/resolve/main/",
-         "local_id": "Mistral-7B-Instruct-v0.2-q4f16_1",
-         "model_lib_url": "https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/Mistral-7B-Instruct-v0.2/Mistral-7B-Instruct-v0.2-q4f16_1-sw4k_cs1k-webgpu.wasm",
-         "required_features": ["shader-f16"],
+         model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f32_1-MLC",
+         model_id: "Llama-3-8B-Instruct-q4f32_1-MLC",
+         model_lib:
+           webllm.modelLibURLPrefix +
+           webllm.modelVersion +
+           "/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
        },
        // Add your own models here...
-     ]
-   }
-   const selectedModel = "Llama-2-7b-chat-hf-q4f32_1"
-   // const selectedModel = "Mistral-7B-Instruct-v0.1-q4f16_1"
-   await chat.reload(selectedModel, undefined, myAppConfig);
+     ],
+   };
+   const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+   const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
+     selectedModel,
+     { appConfig: appConfig },
+   );

-Just like any other platforms, to run a model with on WebLLM, you need:
+As the code above shows, just like on any other platform supported by MLC-LLM, to
+run a model on WebLLM you need:

-1. **Model weights** converted to MLC format (e.g. `Llama-2-7b-hf-q4f32_1-MLC
-   `_.): downloaded through ``model_url``
-2. **Model library** that comprises the inference logic (see repo `binary-mlc-llm-libs `__): downloaded through ``model_lib_url``.
+1. **Model weights** converted to MLC format (e.g. `Llama-3-8B-Instruct-q4f32_1-MLC
+   `_): downloaded through the URL in ``ModelRecord.model``
+2. **Model library** that comprises the inference logic (see repo `binary-mlc-llm-libs `__): downloaded through the URL in ``ModelRecord.model_lib``.
+
+In the sections below, we walk you through two examples of adding your own model beyond the ones in
+`webllm.prebuiltAppConfig.model_list `__.
+Before proceeding, please verify the installation of ``mlc_llm`` and ``tvm``.

 Verify Installation for Adding Models
 -------------------------------------

-In sections below, we walk you through two examples of adding models to WebLLM. Before proceeding,
-please verify installation of ``mlc_llm`` and ``tvm``:
-
 **Step 1. Verify mlc_llm**

 We use the python package ``mlc_llm`` to compile models. This can be installed by
@@ -106,7 +124,7 @@ In cases where the model you are adding is simply a variant of an existing
 model, we only need to convert weights and reuse existing model library. For instance:

 - Adding ``OpenMistral`` when MLC supports ``Mistral``
-- Adding ``Llama2-uncensored`` when MLC supports ``Llama2``
+- Adding a ``Llama3`` model fine-tuned for a domain-specific task when MLC supports ``Llama3``

 In this section, we walk you through adding ``WizardMath-7B-V1.1-q4f16_1`` to the
@@ -150,23 +168,9 @@ See :ref:`compile-command-specification` for specification of ``gen_config``.
     --quantization q4f16_1 --conv-template wizard_coder_or_math \
     -o dist/WizardMath-7B-V1.1-q4f16_1-MLC/

-For the ``conv-template``, `conversation_template.py `__
-contains a full list of conversation templates that MLC provides.
-
-If the model you are adding requires a new conversation template, you would need to add your own.
-Follow `this PR `__ as an example. Besides, you also need to add the new template to ``/path/to/web-llm/src/conversation.ts``.
-We look up the template to use with the ``conv_template`` field in ``mlc-chat-config.json``.
-
-For more details, please see :ref:`configure-mlc-chat-json`.
-
-.. note::
-
-   If you added your conversation template in ``src/conversation.ts``, you need to build WebLLM
-   from source following the instruction in
-   `the WebLLM repo's README `_.
-
-   Alternatively, you could use the ``"custom"`` conversation template so that you can pass in
-   your own ``ConvTemplateConfig`` in runtime without having to build the package from source.
+For the ``conv-template``, `conversation_template.py `__
+contains the full list of conversation templates that MLC provides. You can also manually modify
+``mlc-chat-config.json`` to add your own customized conversation template.

 **Step 3 Upload weights to HF**

@@ -192,26 +196,30 @@ Finally, we modify the code snippet for
 `get-started `__
 pasted above.

-We simply specify the Huggingface link as ``model_url``, while reusing the ``model_lib_url`` for
-``Mistral-7B``. Note that we need the suffix to be ``/resolve/main/``.
+We simply specify the Hugging Face link as ``model``, while reusing the ``model_lib`` of
+``Mistral-7B``.

 .. code:: typescript

-   const myAppConfig: AppConfig = {
+   const appConfig: webllm.AppConfig = {
      model_list: [
-       // Other records here omitted...
        {
-         // Substitute model_url with the one you created `my-huggingface-account/my-wizardMath-weight-huggingface-repo`
-         "model_url": "https://huggingface.co/mlc-ai/WizardMath-7B-V1.1-q4f16_1-MLC/resolve/main/",
-         "local_id": "WizardMath-7B-V1.1-q4f16_1",
-         "model_lib_url": "https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/Mistral-7B-Instruct-v0.2/Mistral-7B-Instruct-v0.2-q4f16_1-sw4k_cs1k-webgpu.wasm",
-         "required_features": ["shader-f16"],
+         model: "https://huggingface.co/mlc-ai/WizardMath-7B-V1.1-q4f16_1-MLC",
+         model_id: "WizardMath-7B-V1.1-q4f16_1-MLC",
+         model_lib:
+           webllm.modelLibURLPrefix +
+           webllm.modelVersion +
+           "/Mistral-7B-Instruct-v0.3-q4f16_1-ctx4k_cs1k-webgpu.wasm",
        },
-     ]
-   }
-   const selectedModel = "WizardMath-7B-V1.1-q4f16_1"
-   await chat.reload(selectedModel, undefined, myAppConfig);
+       // Add your own models here...
+     ],
+   };
+   const selectedModel = "WizardMath-7B-V1.1-q4f16_1-MLC";
+   const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
+     selectedModel,
+     { appConfig: appConfig },
+   );

 Now, running the ``get-started`` example will use the ``WizardMath`` model you just added.
 See `get-started's README `__
@@ -223,9 +231,9 @@ Bring Your Own Model Library
 ----------------------------

 A model library is specified by:

-  - The model architecture (e.g. ``llama-2``, ``gpt-neox``)
+  - The model architecture (e.g. ``llama-3``, ``gpt-neox``, ``phi-3``)
   - Quantization (e.g. ``q4f16_1``, ``q0f32``)
-  - Metadata (e.g. ``context_window_size``, ``sliding_window_size``, ``prefill-chunk-size``), which affects memory planning
+  - Metadata (e.g. ``context_window_size``, ``sliding_window_size``, ``prefill-chunk-size``), which affects memory planning (currently only ``prefill-chunk-size`` affects the compiled model)
   - Platform (e.g. ``cuda``, ``webgpu``, ``iOS``)
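+
+As an aside on the quantization choice: on the ``webgpu`` platform, ``q4f16_1`` model libraries
+rely on the ``shader-f16`` WebGPU feature, so an application may want to detect that feature
+before picking a model. Below is a minimal sketch; it assumes a browser context with WebGPU
+typings available, and the two model IDs are only illustrative:
+
+.. code:: typescript
+
+   // Prefer a q4f16_1 model when the adapter supports f16 shaders,
+   // otherwise fall back to a q4f32_1 variant of the same model.
+   async function pickModelId(): Promise<string> {
+     const adapter = "gpu" in navigator ? await navigator.gpu.requestAdapter() : null;
+     const hasF16 = adapter !== null && adapter.features.has("shader-f16");
+     return hasF16
+       ? "Llama-3-8B-Instruct-q4f16_1-MLC" // illustrative model_id
+       : "Llama-3-8B-Instruct-q4f32_1-MLC"; // prebuilt model_id used in the examples above
+   }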

 In cases where the model you want to run is not compatible with the provided MLC
@@ -288,9 +296,8 @@ All these knobs are specified in ``mlc-chat-config.json`` generated by ``gen_config``.
      --device webgpu -o dist/libs/RedPajama-INCITE-Chat-3B-v1-q4f16_1-webgpu.wasm

 .. note::
-   When compiling larger models like ``Llama-2-7B``, you may want to add ``--prefill_chunk_size 1024`` or
-   lower ``context_window_size`` to decrease memory usage. Otherwise, during runtime,
-   you may run into issues like:
+   When compiling larger models like ``Llama-3-8B``, you may want to add ``--prefill_chunk_size 1024``
+   to decrease memory usage. Otherwise, at runtime, you may run into issues like:

 .. code:: text

@@ -344,17 +351,20 @@ Finally, we are able to run the model we added in WebLLM's `get-started `__
-on how to run it.
\ No newline at end of file
+on how to run it.
diff --git a/docs/install/emcc.rst b/docs/install/emcc.rst
index 64a14f817b..79b4032f33 100644
--- a/docs/install/emcc.rst
+++ b/docs/install/emcc.rst
@@ -21,6 +21,18 @@ Validate that emcc is accessible in shell

    emcc --version

+.. note::
+   We recently found that the latest ``emcc`` version may lead to issues at runtime. As a workaround,
+   use ``./emsdk install 3.1.56`` instead of ``./emsdk install latest`` for now.
+
+   The error may look like:
+
+   .. code:: text
+
+      Init error, LinkError: WebAssembly.instantiate(): Import #6 module="wasi_snapshot_preview1"
+      function="proc_exit": function import requires a callable
+
+
 Step 2: Set TVM_SOURCE_DIR and MLC_LLM_SOURCE_DIR
 -------------------------------------------------
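
Once a model record is in place and the engine has been created (whether with a prebuilt
``model_id`` or with a model variant and library you compiled yourself), generation goes through
the engine's OpenAI-compatible chat completions API. The snippet below is a minimal sketch rather
than the full API surface: it assumes the ``@mlc-ai/web-llm`` npm package and reuses the prebuilt
``Llama-3-8B-Instruct-q4f32_1-MLC`` model_id from the examples above; see the WebLLM repo for the
authoritative usage.

.. code:: typescript

   import * as webllm from "@mlc-ai/web-llm";

   async function main() {
     // Downloads (or loads from cache) the MLC weights and the WebGPU model library.
     const engine = await webllm.CreateMLCEngine("Llama-3-8B-Instruct-q4f32_1-MLC");

     // OpenAI-style request: messages use the usual role/content shape.
     const reply = await engine.chat.completions.create({
       messages: [{ role: "user", content: "In one sentence, what does WebGPU provide?" }],
     });
     console.log(reply.choices[0].message.content);
   }

   main();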