diff --git a/Makefile b/Makefile
index 4ea8ddfe..0530b5a7 100644
--- a/Makefile
+++ b/Makefile
@@ -215,6 +215,7 @@ build-deepspeed-gpu: build-gpu-cuda-113-base
 # This builds deepspeed environment off of a patched version of EleutherAI's fork of DeepSpeed
 # that we need for gpt-neox support.
 .PHONY: build-gpt-neox-deepspeed-gpu
+build-gpt-neox-deepspeed-gpu: export DOCKER_BUILDKIT=0
 build-gpt-neox-deepspeed-gpu: build-gpu-cuda-113-base
 	docker build -f Dockerfile-default-gpu \
 		--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_113_BASE_NAME)-$(SHORT_GIT_HASH)" \
@@ -223,7 +224,7 @@ build-gpt-neox-deepspeed-gpu: build-gpu-cuda-113-base
 		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
 		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
 		--build-arg DET_BUILD_NCCL="" \
-		--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@eleuther_dai" \
+		--build-arg DEEPSPEED_PIP="git+https://github.com/EleutherAI/DeeperSpeed.git@0a237296f760efd4f58eb3c32b6cdc429a39041a#egg=deepspeed" \
 		-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
 		-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
 		-t $(NGC_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
diff --git a/dockerfile_scripts/install_deepspeed.sh b/dockerfile_scripts/install_deepspeed.sh
index 623cd884..1ac14e0f 100755
--- a/dockerfile_scripts/install_deepspeed.sh
+++ b/dockerfile_scripts/install_deepspeed.sh
@@ -7,3 +7,24 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev
 python -m pip install triton==1.0.0
 DS_BUILD_OPS=1 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
 python -m deepspeed.env_report
+
+if [[ "$DEEPSPEED_PIP" == *"EleutherAI"* ]]; then
+    # mpich is a dependency of gpt-neox; install it noninteractively like the other apt packages.
+    DEBIAN_FRONTEND=noninteractive apt-get install -y mpich
+    # Pin setuptools to avoid `AttributeError: module 'distutils' has no attribute 'version'` when importing tensorboard. See https://github.com/pytorch/pytorch/issues/69894.
+    python -m pip install setuptools==59.5.0
+    # Install gpt-neox and dependencies. Clone to the absolute path /gpt-neox so every later step works regardless of the working directory.
+    git clone -b determined2 https://github.com/determined-ai/gpt-neox.git /gpt-neox
+    python /gpt-neox/megatron/fused_kernels/setup.py install
+
+    # Exclude DeeperSpeed reinstall since the version in requirements is not pinned.
+    python -m pip install $(grep -ivE "DeeperSpeed" /gpt-neox/requirements/requirements.txt)
+    python -m pip install -r /gpt-neox/requirements/requirements-flashattention.txt
+
+    # Download sample data and expose it as /gpt-neox/data
+    gsutil cp -r gs://determined-ai-public-datasets/text_data /gpt-neox && mv /gpt-neox/text_data /gpt-neox/data
+
+    # Modify permissions to enable example to run in nonroot mode
+    chmod -R 777 /gpt-neox
+    chmod -R 777 /tmp
+fi