Skip to content

Commit

Permalink
Fix Nvidia Image build (#489)
Browse files Browse the repository at this point in the history
  • Loading branch information
Issacwww authored Oct 9, 2024
1 parent 2125490 commit 7d8a797
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 23 deletions.
18 changes: 10 additions & 8 deletions e2e2/test/cases/nvidia/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@ import (
)

var (
testenv env.Environment
nodeType *string
installDevicePlugin *bool
efaEnabled *bool
nvidiaTestImage *string
nodeCount int
gpuPerNode int
efaPerNode int
testenv env.Environment
nodeType *string
installDevicePlugin *bool
efaEnabled *bool
nvidiaTestImage *string
skipUnitTestSubcommand *string
nodeCount int
gpuPerNode int
efaPerNode int
)

var (
Expand Down Expand Up @@ -134,6 +135,7 @@ func TestMain(m *testing.M) {
nvidiaTestImage = flag.String("nvidiaTestImage", "", "nccl test image for nccl tests")
efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests")
installDevicePlugin = flag.Bool("installDevicePlugin", true, "install nvidia device plugin")
skipUnitTestSubcommand = flag.String("skipUnitTestSubcommand", "", "optional command to skip specified unit test, `-s test1|test2|...`")
cfg, err := envconf.NewFromFlags()
if err != nil {
log.Fatalf("failed to initialize test environment: %v", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ spec:
command:
- /bin/bash
- ./gpu_unit_tests/unit_test
# Forwards the optional bash_unit skip sub-command to the unit_test script,
# which reads it from SKIP_TESTS_SUBCOMMAND. The template field must match the
# SkipTestSubcommand field of the Go template variables; the value is quoted so
# an empty or special-character value is still rendered as a YAML string.
env:
- name: SKIP_TESTS_SUBCOMMAND
  value: "{{.SkipTestSubcommand}}"
imagePullPolicy: Always
resources:
limits:
Expand Down
8 changes: 6 additions & 2 deletions e2e2/test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ var (
)

// unitTestManifestTplVars holds the values substituted into the single-node
// unit test job manifest template.
type unitTestManifestTplVars struct {
	// NvidiaTestImage is the container image the unit test job runs.
	NvidiaTestImage string
	// SkipTestSubcommand is the optional bash_unit sub-command
	// ("-s test1|test2|...") selecting unit tests to skip; empty runs all.
	SkipTestSubcommand string
	// GpuPerNode is the per-node GPU count made available to the template.
	GpuPerNode int
}

type hpcTestManifestTplVars struct {
Expand All @@ -42,7 +44,9 @@ func TestSingleNodeUnitTest(t *testing.T) {
}
var err error
renderedJobUnitTestSingleNodeManifest, err = fwext.RenderManifests(jobUnitTestSingleNodeManifest, unitTestManifestTplVars{
NvidiaTestImage: *nvidiaTestImage,
NvidiaTestImage: *nvidiaTestImage,
SkipTestSubcommand: *skipUnitTestSubcommand,
GpuPerNode: gpuPerNode,
})
if err != nil {
t.Fatal(err)
Expand Down
20 changes: 10 additions & 10 deletions e2e2/test/images/nvidia/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=5

# Start with the NVIDIA CUDA base image
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}.1-devel-ubuntu${UBUNTU_MAJOR_VERSION}.04
FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.1-devel-ubuntu$UBUNTU_MAJOR_VERSION.04

ARG UBUNTU_MAJOR_VERSION
ARG CUDA_MAJOR_VERSION
Expand Down Expand Up @@ -41,7 +41,7 @@ RUN apt install -y \
cmake \
apt-utils \
libhwloc-dev \
cuda-demo-suite-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
cuda-demo-suite-$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION \
datacenter-gpu-manager

RUN mkdir -p /var/run/sshd \
Expand All @@ -55,24 +55,24 @@ ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sb
# Install EFA
ARG EFA_INSTALLER_VERSION=latest
RUN cd /tmp \
&& curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xvz \
&& curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-$EFA_INSTALLER_VERSION.tar.gz | tar xvz \
&& cd aws-efa-installer \
&& ./efa_installer.sh --yes --enable-gdr --skip-kmod --skip-limit-conf --no-verify --mpi openmpi5 \
&& rm -rf /tmp/* \
/var/lib/apt/lists/*

# Install NCCL
ARG NCCL_VERSION=2.22.3-1+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}
ARG NCCL_VERSION=2.22.3-1+cuda12.5
RUN apt update \
&& apt install -y \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION}
libnccl2=$NCCL_VERSION \
libnccl-dev=$NCCL_VERSION

# Install AWS-OFI-NCCL plugin
ARG AWS_OFI_NCCL_VERSION=1.11.0-aws
RUN cd tmp \
&& curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz | tar xvz \
&& cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} \
&& curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz \
&& cd aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-mpi=/opt/amazon/openmpi \
--with-libfabric=/opt/amazon/efa \
Expand All @@ -85,8 +85,8 @@ RUN cd tmp \
# Install NCCL Tests
ARG NCCL_TESTS_VERSION=2.13.10
RUN cd /tmp \
&& curl -sL https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz | tar xvz \
&& cd nccl-tests-${NCCL_TESTS_VERSION} \
&& curl -sL https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v$NCCL_TESTS_VERSION.tar.gz | tar xvz \
&& cd nccl-tests-$NCCL_TESTS_VERSION \
&& make MPI=1 \
MPI_HOME=/opt/amazon/openmpi5/ \
CUDA_HOME=/usr/local/cuda \
Expand Down
13 changes: 11 additions & 2 deletions e2e2/test/images/nvidia/gpu_unit_tests/bash_unit
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,9 @@ run_tests() {
declare -F | "$GREP" ' setup$' >/dev/null && setup
__bash_unit_test_skipped__=$(mktemp)
trap "$RM -f \"$stdout\" \"$stderr\"" EXIT
if [[ -n "$skip_pattern" && ("$test" =~ $skip_pattern) ]]; then
skip "$test as specified in skip pattern: $skip_pattern"
fi
(__bash_unit_current_test__="$test" run_test) || status=$?
test -s $__bash_unit_test_skipped__ && status=0
declare -F | "$GREP" ' teardown$' >/dev/null && teardown
Expand All @@ -311,9 +314,10 @@ run_teardown_suite() {

usage() {
echo "$1" >&2
echo "$0 [-f <output format>] [-p <pattern1>] [-p <pattern2>] [-r] ... <test_file1> <test_file2> ..." >&2
echo "$0 [-f <output format>] [-p <pattern1>] [-p <pattern2>] [-s <skip_pattern>] [-r] ... <test_file1> <test_file2> ..." >&2
echo >&2
echo "Runs tests in test files that match <pattern>s" >&2
echo "Skip tests in test files that match <skip_pattern>s" >&2
echo "<output format> is optional only supported value is tap" >&2
echo "-r to execute test cases in random order" >&2
echo "-v to get current version information" >&2
Expand Down Expand Up @@ -533,16 +537,21 @@ tap_format() {

# Defaults for the command-line options parsed below.
output_format=text
test_pattern=""
skip_pattern=""
trace_file=""
separator=""
randomise=0
# Option string: -p, -s, -t and -f each take an argument; -v and -r are plain
# switches. The colon must follow "s" (not "r"), otherwise -r would consume
# the next argument and -s would never receive its pattern in OPTARG.
while getopts "vp:s:t:f:r" option
do
  case "$option" in
    p)
      # Repeated -p patterns are joined with "|".
      test_pattern="${test_pattern}${separator}${OPTARG}"
      separator="|"
      ;;
    s)
      # Repeated -s patterns are joined with "|" too. The join is derived from
      # skip_pattern itself rather than the separator shared with -p, so that
      # combining -p and -s never yields a pattern with a leading "|" (an
      # empty alternative would match every test name).
      skip_pattern="${skip_pattern:+${skip_pattern}|}${OPTARG}"
      ;;
t)
trace_file="$(realpath ${OPTARG})"
truncate -s0 "$trace_file"
Expand Down
3 changes: 2 additions & 1 deletion e2e2/test/images/nvidia/gpu_unit_tests/unit_test
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ TRACE_LOG=trace.log
TEST_TIMEOUT=1800
BASH="/usr/bin/bash"
CURRENT_DIR=$(pwd)
SKIP_TESTS_SUBCOMMAND=${SKIP_TESTS_SUBCOMMAND:-""}

timeout -k 10 ${TEST_TIMEOUT} ${BASH} gpu_unit_tests/bash_unit -f tap -t gpu_unit_tests/${TRACE_LOG} gpu_unit_tests/tests/*test*.sh
timeout -k 10 ${TEST_TIMEOUT} ${BASH} gpu_unit_tests/bash_unit -f tap ${SKIP_TESTS_SUBCOMMAND} -t gpu_unit_tests/${TRACE_LOG} gpu_unit_tests/tests/*test*.sh

0 comments on commit 7d8a797

Please sign in to comment.