
Merge branch 'main' into op/layernorm_kernel
dongxianzhe authored Apr 22, 2024
2 parents 4dc3b5f + dff774e commit e18e337
Showing 177 changed files with 8,323 additions and 3,095 deletions.
1 change: 1 addition & 0 deletions .clang-tidy
@@ -35,6 +35,7 @@ Checks: >
-cppcoreguidelines-non-private-member-variables-in-classes,
-cppcoreguidelines-pro-type-reinterpret-cast,
-cppcoreguidelines-macro-usage,
-cppcoreguidelines-owning-memory,
HeaderFilterRegex: '.*'
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -39,7 +39,7 @@ jobs:
# Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
build_type: [Release]

runs-on: [self-hosted, linux, x64, 1gpu]
runs-on: [self-hosted, linux, x64, 1gpu, 32g]

env:
BUILD_TYPE: ${{ matrix.build_type }}
2 changes: 1 addition & 1 deletion .github/workflows/docker.yml
@@ -8,7 +8,7 @@ on:

jobs:
publish_scalellm:
runs-on: [self-hosted, linux, x64, 1gpu]
runs-on: [self-hosted, linux, x64, 1gpu, 128g]
steps:
- uses: olegtarasov/get-tag@v2.1
id: tagName
36 changes: 36 additions & 0 deletions .github/workflows/format.yml
@@ -0,0 +1,36 @@
name: clang-format
on:
pull_request:
branches:
- main

jobs:
clang-format:
runs-on: ubuntu-latest
steps:
- name: Install clang-format
run: |
sudo apt-get install -y clang-format colordiff
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Run clang-format
run: |
diff=`git-clang-format --extensions="c,h,m,mm,cc,cp,cpp,c++,cxx,hh,hpp,hxx,inc,cu,cuh,proto,protodevel" --diff --commit ${{ github.event.pull_request.base.sha }}`
[ "$diff" = "no modified files to format" ] && exit 0
[ "$diff" = "clang-format did not modify any files" ] && exit 0
printf "\nYou have introduced coding style breakages. You can:\n"
echo "1> Fix the errors with git-clang-format:"
echo " git-clang-format --commit ${{ github.event.pull_request.base.sha }}"
echo "2> Disable checks on section of the code with:"
echo " // clang-format off"
echo " code"
echo " // clang-format on"
printf "\n\033[1mSuggested changes:\n\n"
echo "$diff" | colordiff
exit 1
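
A note for contributors: the check above can also be reproduced locally before opening a PR. The sketch below is an editor's approximation, not part of this commit — it assumes `clang-format` and `git-clang-format` are installed and that the branch was cut from `origin/main` (the CI job diffs against the PR's base SHA instead).

```bash
# Reformat only the lines changed on this branch, using the same extension list as the workflow.
git fetch origin main
git-clang-format \
  --extensions="c,h,m,mm,cc,cp,cpp,c++,cxx,hh,hpp,hxx,inc,cu,cuh,proto,protodevel" \
  --commit "$(git merge-base origin/main HEAD)"

# Inspect and commit whatever it rewrote.
git status && git diff
```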
6 changes: 3 additions & 3 deletions .gitmodules
@@ -1,9 +1,9 @@
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = git@github.com:pybind/pybind11.git
url = https://github.com/pybind/pybind11.git
[submodule "third_party/flashinfer"]
path = third_party/flashinfer
url = git@github.com:vectorch-ai/flashinfer.git
url = https://github.com/vectorch-ai/flashinfer.git
[submodule "third_party/cutlass"]
path = third_party/cutlass
url = git@github.com:NVIDIA/cutlass.git
url = https://github.com/NVIDIA/cutlass.git
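
Because the submodule URLs switch from SSH to HTTPS here, existing clones need to pick up the new remotes. A minimal way to do that (standard git commands, not taken from this commit) would be:

```bash
# Refresh submodule remotes from the updated .gitmodules, then re-sync the checkouts.
git submodule sync --recursive
git submodule update --init --recursive
```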
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -171,7 +171,7 @@ else()
endif()

# carry over torch flags to the rest of the project
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG -flto=auto")
message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")

@@ -182,7 +182,7 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}
-U__CUDA_NO_HALF_CONVERSIONS__
-U__CUDA_NO_HALF2_OPERATORS__
-U__CUDA_NO_BFLOAT16_CONVERSIONS__)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math -Xfatbin -compress-all)
message(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")

# enable testing in this directory so we can do a top-level `make test`.
32 changes: 30 additions & 2 deletions Dockerfile.devel
@@ -14,12 +14,27 @@ RUN apt-get update -q -y && \
cmake \
ccache \
python3-dev \
python3-pip \
zip \
pkg-config \
libssl-dev \
libboost-all-dev \
curl \
git
git \
wget

# install jemalloc (optional)
RUN cd /tmp && \
wget -nc --no-check-certificate https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2 && \
tar -xvf jemalloc-5.3.0.tar.bz2 && \
(cd jemalloc-5.3.0 && \
./configure --enable-prof --disable-initial-exec-tls && \
make -j$(nproc) && make install && \
ldconfig)

# install nsys
ADD https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2024_2/nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb .
RUN apt-get install -y ./nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb

# install rust
ENV RUSTUP_HOME=/usr/local/rustup
@@ -30,7 +45,20 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
RUN chown -R $UID:$GID /usr/local/rustup
RUN chown -R $UID:$GID /usr/local/cargo

# TODO: install golang
# Install miniconda
RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/Miniconda3-latest-Linux-x86_64.sh
RUN cd /tmp && \
chmod +x Miniconda3-latest-Linux-x86_64.sh && \
bash ./Miniconda3-latest-Linux-x86_64.sh -b -u

# Test activate miniconda
RUN . ${HOME}/miniconda3/etc/profile.d/conda.sh && \
conda activate base && \
conda init

RUN echo "\
. \${HOME}/miniconda3/etc/profile.d/conda.sh\n\
conda activate base\n" >> ${HOME}/.bashrc

CMD ["/bin/bash"]

68 changes: 27 additions & 41 deletions README.md
@@ -1,24 +1,29 @@
# ScaleLLM: An efficient LLM Inference solution
[![build and test](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml) [![GitHub Repo stars](https://img.shields.io/github/stars/vectorch-ai/ScaleLLM?style=social)](https://github.com/vectorch-ai/ScaleLLM/stargazers)
[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![GitHub Repo stars](https://img.shields.io/github/stars/vectorch-ai/ScaleLLM?style=social)](https://github.com/vectorch-ai/ScaleLLM/stargazers) [![build and test](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml)

[![Discord](https://dcbadge.vercel.app/api/server/PKe5gvBZfn)](https://discord.gg/PKe5gvBZfn)

[![Discord](https://dcbadge.vercel.app/api/server/PKe5gvBZfn)](https://discord.gg/PKe5gvBZfn)

> **Warning**<br />
> ScaleLLM is currently in the active development stage and may not yet provide the optimal level of inference efficiency. We are fully dedicated to continuously enhancing its efficiency while also adding more features.
[ScaleLLM]() is a cutting-edge inference system engineered for large language models (LLMs), meticulously designed to meet the demands of production environments. It extends its support to a wide range of popular open-source models, including [Llama3](https://github.com/meta-llama/llama3), [Gemma](https://github.com/google-deepmind/gemma), Bloom, GPT-NeoX, and more.

ScaleLLM is currently undergoing active development. We are fully committed to consistently enhancing its efficiency while also incorporating additional features. Feel free to explore our [**_Roadmap_**](https://github.com/vectorch-ai/ScaleLLM/issues/84) for more details.

In the coming weeks, we have exciting plans to focus on [**_speculative decoding_**](https://github.com/orgs/vectorch-ai/projects/1) and [**_stateful conversation_**](https://github.com/orgs/vectorch-ai/projects/2), alongside further kernel optimizations. We appreciate your understanding and look forward to delivering an even better solution.

## News:
* [03/2024] - [Advanced feature](https://github.com/vectorch-ai/ScaleLLM/releases/tag/v0.0.7) support for CUDA graph, [dynamic prefix cache](), [dynamic chunked prefill]() and [speculative decoding]().
* [11/2023] - [First release](https://github.com/vectorch-ai/ScaleLLM/releases/tag/v0.0.1) with support for popular [open-source models](#supported-models).

## Latest News:
* [11/2023] - First [official release](https://github.com/vectorch-ai/ScaleLLM/releases/tag/v0.0.1) with support for popular open-source models.
## Key Features

- [High Efficiency](): Excels in high-performance LLM inference, leveraging state-of-the-art techniques and technologies like [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Paged Attention](https://github.com/vllm-project/vllm), [Continuous batching](https://www.anyscale.com/blog/continuous-batching-llm-inference), and more.
- [Tensor Parallelism](): Utilizes tensor parallelism for efficient model execution.
- [OpenAI-compatible API](): An efficient [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) rest api server that compatible with OpenAI.
- [Huggingface models](): Seamless integration with most popular [HF models](#supported-models), supporting safetensors.
- [Customizable](): Offers flexibility for customization to meet your specific needs, and provides an easy way to add new models.
- [Production Ready](): Engineered with production environments in mind, ScaleLLM is equipped with robust system monitoring and management features to ensure a seamless deployment experience.

## Table of contents

- [Overview](#overview)
- [Supported Models](#supported-models)
- [Get Started](#get-started)
- [ScaleLLM server](#scalellm-server)
@@ -32,42 +37,20 @@ In the coming weeks, we have exciting plans to focus on [**_speculative decoding
- [Acknowledgements](#acknowledgements)
- [License](#license)


## Overview

ScaleLLM is a cutting-edge inference system engineered for large language models (LLMs), meticulously designed to meet the demands of production environments. It extends its support to a wide range of popular open-source models, including Llama2, Bloom, GPT-NeoX, and more.

## Key Features

- [High Efficiency](): Excels in high-performance LLM inference, leveraging state-of-the-art techniques and technologies like [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Paged Attention](https://github.com/vllm-project/vllm), [Continuous batching](https://www.anyscale.com/blog/continuous-batching-llm-inference), and more.
- [Tensor Parallelism](): Utilizes tensor parallelism for efficient model execution.
- [OpenAI-compatible API](): An efficient [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) rest api server that compatible with OpenAI.
- [Huggingface models](): Seamless integration with most popular [HF models](#supported-models), supporting safetensors.
- [Customizable](): Offers flexibility for customization to meet your specific needs, and provides an easy way to add new models.
- [Production Ready](): Engineered with production environments in mind, ScaleLLM is equipped with robust system monitoring and management features to ensure a seamless deployment experience.


## Supported Models

Please note that in order to use Yi models, you need to add `--model_type=Yi` to the command line. For example:
```bash
docker run -it --gpus=all --net=host --shm-size=1g \
-v $HOME/.cache/huggingface/hub:/models \
-e HF_MODEL_ID=01-ai/Yi-34B-Chat-4bits \
-e DEVICE=auto \
docker.io/vectorchai/scalellm:latest --logtostderr --model_type=Yi
```

| Models | Tensor Parallel | Quantization | Chat API | HF models examples |
| :--------: | :-------------: | :----------: | :------: | :---------------------------:|
| Aquila | Yes | Yes | Yes | [BAAI/Aquila-7B](https://huggingface.co/BAAI/Aquila-7B), [BAAI/AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B) |
| Bloom | Yes | Yes | No | [bigscience/bloom](https://huggingface.co/bigscience/bloom) |
| Baichuan | Yes | Yes | Yes | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) |
| ChatGLM3 | Yes | Yes | Yes | [THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b) |
| Gemma | Yes | Yes | Yes | [google/gemma-2b](https://huggingface.co/google/gemma-2b) |
| GPT_j | Yes | Yes | No | [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b) |
| GPT_NeoX | Yes | Yes | No | [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) |
| GPT2 | Yes | Yes | No | [gpt2](https://huggingface.co/gpt2)|
| InternLM | Yes | Yes | Yes | [internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b) |
| Llama2 | Yes | Yes | Yes | [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b), [TheBloke/Llama-2-13B-chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ), [TheBloke/Llama-2-70B-AWQ](https://huggingface.co/TheBloke/Llama-2-70B-AWQ) |
| Llama3/2 | Yes | Yes | Yes | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct), [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) |
| Mistral | Yes | Yes | Yes | [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) |
| MPT | Yes | Yes | Yes | [mosaicml/mpt-30b](https://huggingface.co/mosaicml/mpt-30b) |
| Phi2 | Yes | Yes | No | [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) |
@@ -96,9 +79,10 @@ You can download and install Docker from the official website: [Docker Installat
Once you have Docker installed, you can run ScaleLLM Docker container with [latest image](https://hub.docker.com/r/vectorchai/scalellm/tags) using the following command:

```bash
docker pull docker.io/vectorchai/scalellm:latest
docker run -it --gpus=all --net=host --shm-size=1g \
-v $HOME/.cache/huggingface/hub:/models \
-e HF_MODEL_ID=TheBloke/Llama-2-7B-chat-AWQ \
-e HF_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct \
-e DEVICE=cuda:0 \
docker.io/vectorchai/scalellm:latest --logtostderr
```
@@ -109,7 +93,7 @@ This command starts the Docker container with GPU support and various configurat
- `HF_MODEL_REVISION` specifies which Hugging Face model revision you want to run. By default, it is set to `"main"`.
- `DEVICE` specifies the device on which this model should run. By default, it is set to `"auto"`, using all available GPUs. You can also specify specific GPUs by using `"cuda:0,cuda:1"`, or use CPU by using `"cpu"`.
- `HF_MODEL_ALLOW_PATTERN` specifies which types of files are allowed to be downloaded. By default, it will be configured automatically based on tensor type. Only use this option if the default configuration is not working for you.
- `HUGGING_FACE_HUB_TOKEN` specifies the token from [huggingface](https://huggingface.co/settings/tokens) for gated models.
- `HUGGING_FACE_HUB_TOKEN` specifies the token from [huggingface](https://huggingface.co/settings/tokens) for gated models. `-e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN`
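
Taken together, the options above combine roughly as follows for a gated model; this is an editor's sketch (the revision value and exported token are placeholders), not a command added by this commit:

```bash
docker pull docker.io/vectorchai/scalellm:latest
docker run -it --gpus=all --net=host --shm-size=1g \
  -v $HOME/.cache/huggingface/hub:/models \
  -e HF_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct \
  -e HF_MODEL_REVISION=main \
  -e DEVICE=auto \
  -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
  docker.io/vectorchai/scalellm:latest --logtostderr
```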

> **Warning**<br />
> * The docker image with tag '[latest](https://hub.docker.com/r/vectorchai/scalellm/tags)' could be changed to a new version upon new release. In order to use latest image, you may need to repull the image with specific tag.
@@ -139,6 +123,7 @@ After running the Docker container, two ports are exposed:
You can also start a REST API gateway with [latest image](https://hub.docker.com/r/vectorchai/scalellm-gateway/tags) using the following command:

```bash
docker pull docker.io/vectorchai/scalellm-gateway:latest
docker run -it --net=host \
docker.io/vectorchai/scalellm-gateway:latest --logtostderr
```
@@ -150,6 +135,7 @@ The REST API Server is available on `localhost:8080`. You can use REST API reque
A local Chatbot UI is also available on [localhost:3000](localhost:3000). You can start it with [latest image](https://hub.docker.com/r/vectorchai/chatbot-ui/tags) using the following command:

```bash
docker pull docker.io/vectorchai/chatbot-ui:latest
docker run -it --net=host \
-e OPENAI_API_HOST=http://127.0.0.1:8080 \
-e OPENAI_API_KEY=YOUR_API_KEY \
@@ -162,7 +148,7 @@ Using Docker Compose is the easiest way to run ScaleLLM with all the services to

```bash
curl https://raw.githubusercontent.com/vectorch-ai/ScaleLLM/main/scalellm.yml -sSf > scalellm_compose.yml
HF_MODEL_ID=TheBloke/Llama-2-7B-chat-AWQ DEVICE=cuda docker compose -f ./scalellm_compose.yml up
HF_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct DEVICE=cuda docker compose -f ./scalellm_compose.yml up
```

you will get following running services:
Expand All @@ -180,7 +166,7 @@ You can get chat completions with the following example:
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "TheBloke/Llama-2-7B-chat-AWQ",
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"messages": [
{
"role": "system",
@@ -205,7 +191,7 @@ openai.api_base = "http://localhost:8080/v1"
print("==== Available models ====")
models = openai.Model.list()

model = "TheBloke/Llama-2-7B-chat-AWQ"
model = "meta-llama/Meta-Llama-3-8B-Instruct"

completion = openai.ChatCompletion.create(
model=model,
@@ -232,7 +218,7 @@ For regular completions, you can use this example:
curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "TheBloke/Llama-2-7B-chat-AWQ",
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"prompt": "hello",
"max_tokens": 32,
"temperature": 0.7,
@@ -251,7 +237,7 @@ openai.api_base = "http://localhost:8080/v1"
print("==== Available models ====")
models = openai.Model.list()

model = "TheBloke/Llama-2-7B-chat-AWQ"
model = "meta-llama/Meta-Llama-3-8B-Instruct"

completion = openai.Completion.create(
model=model,
1 change: 0 additions & 1 deletion bindings/python/CMakeLists.txt

This file was deleted.
