Skip to content

Commit

Permalink
chore: Update GPU Dockerfile versions
Browse files Browse the repository at this point in the history
  • Loading branch information
bryantbiggs committed Oct 4, 2024
1 parent ba69695 commit 1dbcf1c
Showing 1 changed file with 74 additions and 73 deletions.
147 changes: 74 additions & 73 deletions e2e2/test/images/nvidia/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
ARG UBUNTU_MAJOR_VERSION=22

ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=5

# Start with the NVIDIA CUDA base image
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}.1-devel-ubuntu${UBUNTU_MAJOR_VERSION}.04

ARG EFA_INSTALLER_VERSION=latest
# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
ARG AWS_OFI_NCCL_VERSION=1.9.1
ARG NCCL_TESTS_VERSION=master
ARG UBUNTU_MAJOR_VERSION
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION

ENV DEBIAN_FRONTEND=noninteractive

# Install necessary dependencies
RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
RUN apt update -y \
&& apt remove -y --allow-change-held-packages \
libmlx5-1 \
ibverbs-utils \
libibverbs-dev \
Expand All @@ -17,84 +23,79 @@ RUN apt-get remove -y --allow-change-held-packages \
libnccl-dev

RUN rm -rf /opt/hpcx \
&& rm -rf /usr/local/mpi \
&& rm -rf /usr/local/ucx \
&& rm -f /etc/ld.so.conf.d/hpcx.conf \
&& ldconfig
&& rm -rf /usr/local/mpi \
&& rm -rf /usr/local/ucx \
&& rm -f /etc/ld.so.conf.d/hpcx.conf \
&& ldconfig

RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
sudo \
git \
gcc \
vim \
kmod \
openssh-client \
openssh-server \
build-essential \
wget curl \
autoconf \
libtool \
gdb \
automake \
python3-distutils \
cmake \
apt-utils \
devscripts \
debhelper \
libsubunit-dev \
check \
pkg-config \
libhwloc-dev \
datacenter-gpu-manager \
cloud-utils \
cuda-demo-suite-12-5
RUN apt install -y \
git \
gcc \
openssh-client \
openssh-server \
build-essential \
curl \
autoconf \
libtool \
automake \
cmake \
apt-utils \
libhwloc-dev \
cuda-demo-suite-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}

RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
RUN mkdir -p /var/run/sshd \
&& sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
&& sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& rm -rf $HOME/aws-efa-installer
ARG EFA_INSTALLER_VERSION=latest
RUN cd /tmp \
&& curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xvz \
&& cd aws-efa-installer \
&& ./efa_installer.sh --yes --enable-gdr --skip-kmod --skip-limit-conf --no-verify --mpi openmpi5 \
&& rm -rf /tmp/* \
/var/lib/apt/lists/*

# Install NCCL
RUN apt-key del 7fa2af80 \
&& curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2
ARG NCCL_VERSION=2.22.3-1+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}
RUN apt update \
&& apt install -y \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION}

## Install AWS-OFI-NCCL plugin
RUN export OPAL_PREFIX="" \
&& git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
&& cd /opt/aws-ofi-nccl \
&& git checkout v${AWS_OFI_NCCL_VERSION}-aws \
&& ./autogen.sh \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-libfabric=/opt/amazon/efa/ \
--with-cuda=/usr/local/cuda \
--with-mpi=/opt/amazon/openmpi/ \
&& make && make install

# Install NCCL Tests
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
&& cd /opt/nccl-tests \
&& git checkout ${NCCL_TESTS_VERSION} \
&& make MPI=1 \
MPI_HOME=/opt/amazon/openmpi/ \
CUDA_HOME=/usr/local/cuda
# Install AWS-OFI-NCCL plugin
ARG AWS_OFI_NCCL_VERSION=1.11.0-aws
RUN cd tmp \
&& curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz | tar xvz \
&& cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-mpi=/opt/amazon/openmpi \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
--disable-tests \
&& make -j $(nproc) \
&& make install

# Install NCCL Tests
ARG NCCL_TESTS_VERSION=2.13.10
RUN cd /tmp \
&& curl -sL https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz | tar xvz \
&& cd nccl-tests-${NCCL_TESTS_VERSION} \
&& make MPI=1 \
MPI_HOME=/opt/amazon/openmpi5/ \
CUDA_HOME=/usr/local/cuda \
&& mkdir -p /opt/nccl-tests \
&& cp -r build /opt/nccl-tests/build \
&& rm -rf /tmp/*

# Set a default command for debugging or modify as per requirements
ENV NCCL_PROTO simple
RUN rm -rf /var/lib/apt/lists/*
ENV LD_PRELOAD /usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD

COPY e2e2/test/images/nvidia/gpu_unit_tests ./gpu_unit_tests
RUN chmod +x ./gpu_unit_tests/unit_test
RUN chmod +x ./gpu_unit_tests/unit_test

0 comments on commit 1dbcf1c

Please sign in to comment.