Skip to content

Commit

Permalink
update gpt-neox image to gpt-neox v2.0 and combine with gpt-neox build
Browse files Browse the repository at this point in the history
  • Loading branch information
liamcli committed May 16, 2023
1 parent cc5289c commit f323f71
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 9 deletions.
12 changes: 6 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,10 @@ build-pytorch10-tf27-rocm50:
-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
.

DEEPSPEED_VERSION := 0.8.3
export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# DeepSpeed release number; embedded in GPU_DEEPSPEED_ENVIRONMENT_NAME below.
DEEPSPEED_VERSION := 0.9.2
# Environment names for the DeepSpeed GPU images. The GPT-NeoX name is used as the
# image tag stem by build-gpt-neox-deepspeed-gpu; unlike the plain DeepSpeed name it
# does not carry $(DEEPSPEED_VERSION).
export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.12-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.12-gpt-neox-deepspeed$(GPU_SUFFIX)
# pip arguments selecting the cu113 build of torch from the PyTorch wheel index.
# NOTE(review): presumably forwarded to docker build as a build arg -- confirm in the recipes.
export TORCH_PIP_DEEPSPEED_GPU := torch==1.12.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# Pinned torch-tb-profiler requirement, passed to docker build as the
# TORCH_TB_PROFILER_PIP build arg.
export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1

# This builds deepspeed environment off of upstream microsoft/DeepSpeed.
Expand Down Expand Up @@ -239,8 +239,8 @@ build-gpt-neox-deepspeed-gpu: build-gpu-cuda-113-base
--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
--build-arg "$(NCCL_BUILD_ARG)" \
--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@eleuther_dai" \
--build-arg DET_BUILD_NCCL="" \
--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@determined2#egg=deepspeed" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
-t $(NGC_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
Expand Down
29 changes: 26 additions & 3 deletions dockerfile_scripts/install_deepspeed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,30 @@
set -e

DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev
# Triton is needed to build deepspeed's sparse_attn operation.
python -m pip install triton==1.0.0
DS_BUILD_OPS=1 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
# Install DeepSpeed from source (--no-binary deepspeed) with DS_BUILD_OPS=1.
# The sparse attention op is excluded (DS_BUILD_SPARSE_ATTN=0) because it depends
# on a very old version of triton.
# NOTE(review): $DEEPSPEED_PIP is left unquoted; presumably so it can carry more
# than one pip argument -- confirm before quoting it.
DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
# Print DeepSpeed's environment report for the freshly installed package.
python -m deepspeed.env_report

# Build gpt-neox and its dependencies, but only when the gpt-neox flavour of
# DeepSpeed is being installed (detected by the "determined2" branch name in
# $DEEPSPEED_PIP).
if [[ "$DEEPSPEED_PIP" == *"determined2"* ]]; then
    # Triton is needed for flash attn.
    python -m pip install triton==2.0.0.dev20221202

    # mpich is a dependency of gpt-neox. Run apt non-interactively, like the
    # other apt-get call in this script, so the image build can never block on
    # a prompt.
    DEBIAN_FRONTEND=noninteractive apt-get install -y mpich

    # Need this to avoid `AttributeError: module 'distutils' has no attribute 'version'`
    # when importing tensorboard. See https://github.com/pytorch/pytorch/issues/69894.
    # Use `python -m pip` throughout so every package lands in the same
    # interpreter that DeepSpeed was installed into.
    python -m pip install setuptools==59.5.0

    # Install gpt-neox and dependencies. Clone to an explicit absolute path:
    # every later step refers to /gpt-neox, so a cwd-relative clone would only
    # work when the script happens to run from /.
    git clone -b determined2 https://github.com/determined-ai/gpt-neox.git /gpt-neox
    python /gpt-neox/megatron/fused_kernels/setup.py install

    # Exclude DeeperSpeed reinstall since the version in requirements is not pinned.
    # The command substitution is intentionally unquoted: each remaining
    # requirement line must become a separate pip argument.
    python -m pip install $(grep -ivE "DeeperSpeed" /gpt-neox/requirements/requirements.txt)
    python -m pip install -r /gpt-neox/requirements/requirements-flashattention.txt

    # Download sample data.
    gsutil cp -r gs://determined-ai-public-datasets/text_data /gpt-neox && mv /gpt-neox/text_data /gpt-neox/data

    # Modify permissions to enable example to run in nonroot mode.
    chmod -R 777 /gpt-neox
    chmod -R 777 /tmp
    # A numeric 777 mode drops the sticky bit on /tmp; put it back so non-root
    # users cannot remove each other's temporary files.
    chmod +t /tmp
fi

0 comments on commit f323f71

Please sign in to comment.