-
Notifications
You must be signed in to change notification settings - Fork 82
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add EFA NCCL test case, unmanaged nodegroup template #427
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,26 +5,21 @@ metadata: | |
spec: | ||
slotsPerWorker: 8 | ||
runPolicy: | ||
# it may take a bit for the workers to get ready (the container image is heavy) | ||
# and we don't want the launcher to reach its CrashLoopBackoff limit in the meantime | ||
backoffLimit: 20 | ||
cleanPodPolicy: Running | ||
mpiReplicaSpecs: | ||
Launcher: | ||
replicas: 1 | ||
template: | ||
spec: | ||
restartPolicy: OnFailure | ||
initContainers: | ||
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3 | ||
name: init | ||
command: ["sh", "-c", "sleep 5"] | ||
containers: | ||
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3 | ||
- image: TODO | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Temporary until there's a build pipeline set up for the image added in this PR. |
||
imagePullPolicy: Always | ||
name: nccl-test-launcher | ||
env: | ||
- name: LD_LIBRARY_PATH | ||
value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH | ||
- name: PATH | ||
value: $PATH:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin | ||
- name: XLA_FLAGS | ||
value: "--xla_gpu_cuda_data_dir=/usr/local/cuda" | ||
- name: TF_XLA_FLAGS | ||
|
@@ -55,11 +50,13 @@ spec: | |
- RDMAV_FORK_SAFE=1 | ||
- -x | ||
- NCCL_PROTO=simple | ||
- -x | ||
- -x | ||
- FI_LOG_LEVEL=warn | ||
- -x | ||
- FI_EFA_USE_DEVICE_RDMA=1 | ||
- -x | ||
- OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is important: it makes the test case fail if something goes wrong with EFA RDMA, instead of silently falling back to the dirt-slow host buffer copy. |
||
- -x | ||
- NCCL_PROTO=simple | ||
- --mca | ||
- pml | ||
|
@@ -88,10 +85,8 @@ spec: | |
- name: dshm | ||
emptyDir: | ||
medium: Memory | ||
nodeSelector: | ||
beta.kubernetes.io/instance-type: p4d.24xlarge | ||
containers: | ||
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3 | ||
- image: TODO | ||
imagePullPolicy: Always | ||
name: nccl-worker | ||
volumeMounts: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# Base image: NVIDIA CUDA 12.2.0 "devel" variant on Ubuntu 22.04.
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

# Build arguments (override with --build-arg NAME=value).

# Release of the AWS EFA installer tarball to download; "latest" tracks
# whatever the download endpoint currently publishes.
ARG EFA_INSTALLER_VERSION=latest
# aws-ofi-nccl release to build (checked out as tag v<version>-aws).
# 1.7.4+ is required, to enforce proper EFA function with
# OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0.
ARG AWS_OFI_NCCL_VERSION=1.7.4
# Git ref of NVIDIA/nccl-tests to check out and build.
ARG NCCL_TESTS_VERSION=master
|
||
# Install necessary dependencies

# Remove the verbs/NCCL packages that may ship with the base image
# (presumably so they cannot clash with the EFA and NCCL versions installed
# further down). The package index is refreshed in the same layer so the
# removal never runs against a stale or missing index.
RUN apt-get update -y \
    && apt-get remove -y --allow-change-held-packages \
        ibverbs-utils \
        libibverbs-dev \
        libibverbs1 \
        libmlx5-1 \
        libnccl-dev \
        libnccl2

# Delete any bundled HPC-X / MPI / UCX trees and their linker configuration,
# then rebuild the dynamic linker cache.
RUN rm -rf /opt/hpcx \
    && rm -rf /usr/local/mpi \
    && rm -rf /usr/local/ucx \
    && rm -f /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig

# Build toolchain, SSH client/server and debugging utilities.
# `apt-get update` is chained with `apt-get install` so a cached index layer
# can never be paired with a newer install step.
# The package index is intentionally left in place here: later steps in this
# file still install packages with apt.
# NOTE(review): --allow-unauthenticated disables package signature
# enforcement; confirm it is still needed.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
        apt-utils \
        autoconf \
        automake \
        build-essential \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        gcc \
        gdb \
        git \
        kmod \
        libhwloc-dev \
        libsubunit-dev \
        libtool \
        openssh-client \
        openssh-server \
        pkg-config \
        python3-distutils \
        sudo \
        vim \
        wget
|
||
# SSH setup:
#  - create the runtime directory sshd expects,
#  - set StrictHostKeyChecking to "no" in the client config,
#  - send known hosts to /dev/null so nothing is remembered,
#  - set StrictModes to "no" in the server config.
RUN mkdir -p /var/run/sshd \
    && sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
# Search paths for the Open MPI, EFA, aws-ofi-nccl and CUDA installations used
# in this image. Entries are prepended to whatever the base image already set.
# Written in key=value form; the space-separated `ENV key value` form is
# deprecated.
ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH=/usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
|
||
# Install EFA
# Downloads the AWS EFA installer archive selected by EFA_INSTALLER_VERSION,
# unpacks it under $HOME and runs it non-interactively.
#  - curl -f makes an HTTP error fail the build instead of saving the error
#    page as the "tarball".
#  - Both the unpacked directory and the downloaded archive are deleted in
#    this same layer so neither is left behind in the image.
RUN cd $HOME \
    && curl -fsSL -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && rm -rf $HOME/aws-efa-installer $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz
|
||
# Install NCCL
# Drops the old NVIDIA repository signing key (7fa2af80), installs the
# cuda-keyring package, then installs libnccl2/libnccl-dev pinned to
# 2.18.5-1+cuda12.2.
#  - Build steps already run as root, so `sudo` is not needed.
#  - `apt-get install -y` is used instead of `apt install`: the build is
#    non-interactive, so any confirmation prompt would abort it.
#  - --allow-change-held-packages mirrors the flag used when the stock NCCL
#    packages are removed earlier in this file, in case a hold is still set.
#  - The downloaded .deb is deleted in the same layer.
# NOTE(review): the keyring is fetched from the ubuntu1804 repository path
# although the base image is Ubuntu 22.04 - confirm this is intentional.
RUN apt-key del 7fa2af80 \
    && curl -fsSL -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
    && dpkg -i cuda-keyring_1.0-1_all.deb \
    && rm -f cuda-keyring_1.0-1_all.deb \
    && apt-get install -y --allow-change-held-packages \
        libnccl2=2.18.5-1+cuda12.2 \
        libnccl-dev=2.18.5-1+cuda12.2
|
||
## Install AWS-OFI-NCCL plugin
# Clones aws-ofi-nccl into /opt/aws-ofi-nccl, checks out tag
# v${AWS_OFI_NCCL_VERSION}-aws and builds it against the libfabric, CUDA and
# Open MPI trees named below, installing into /opt/aws-ofi-nccl/install.
# OPAL_PREFIX is exported as an empty string for the duration of the build.
# `set -e` stops at the first failing command.
RUN set -e; \
    export OPAL_PREFIX=""; \
    git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl; \
    cd /opt/aws-ofi-nccl; \
    git checkout v${AWS_OFI_NCCL_VERSION}-aws; \
    ./autogen.sh; \
    ./configure \
        --prefix=/opt/aws-ofi-nccl/install \
        --with-libfabric=/opt/amazon/efa/ \
        --with-cuda=/usr/local/cuda \
        --with-mpi=/opt/amazon/openmpi/; \
    make; \
    make install
|
||
# Install NCCL Tests
# Clones NVIDIA/nccl-tests into /opt/nccl-tests, checks out
# ${NCCL_TESTS_VERSION} and builds with MPI support against the Open MPI and
# CUDA installations named below. `set -e` stops at the first failing command.
RUN set -e; \
    git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests; \
    cd /opt/nccl-tests; \
    git checkout ${NCCL_TESTS_VERSION}; \
    make MPI=1 MPI_HOME=/opt/amazon/openmpi/ CUDA_HOME=/usr/local/cuda
|
||
# Runtime environment defaults. No CMD/ENTRYPOINT is defined in this file, so
# the command is inherited from the base image or supplied at run time.
# Written in key=value form; the space-separated `ENV key value` form is
# deprecated.
ENV NCCL_PROTO=simple
# Drop the apt package index. NOTE: this runs in its own layer, so it hides
# the files from the final filesystem but does not shrink earlier layers.
RUN rm -rf /var/lib/apt/lists/*
# Preload libnccl.so from the system library directory, ahead of anything
# already present in LD_PRELOAD.
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we have some control over it so that we can easily tune/stabilize the test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This adds the p5 instance types to the nodeSelector for the EFA device plugin; it's just a prerequisite for using p5 instances.