Skip to content

Commit

Permalink
Add EFA NCCL test case, unmanaged nodegroup template (#427)
Browse files Browse the repository at this point in the history
  • Loading branch information
cartermckinnon authored Feb 24, 2024
1 parent 416649d commit a2fee67
Show file tree
Hide file tree
Showing 9 changed files with 536 additions and 24 deletions.
2 changes: 2 additions & 0 deletions e2e2/test/cases/nvidia/manifests/efa-device-plugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ spec:
- p4de.24xlarge
- trn1.32xlarge
- trn1n.32xlarge
- p5.48xlarge
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
Expand All @@ -75,6 +76,7 @@ spec:
- p4de.24xlarge
- trn1.32xlarge
- trn1n.32xlarge
- p5.48xlarge
hostNetwork: true
containers:
- image: 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin:v0.3.3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,21 @@ metadata:
spec:
slotsPerWorker: 8
runPolicy:
# it may take a bit for the workers to get ready (the container image is heavy)
# and we don't want the launcher to reach its CrashLoopBackoff limit in the meantime
backoffLimit: 20
cleanPodPolicy: Running
mpiReplicaSpecs:
Launcher:
replicas: 1
template:
spec:
restartPolicy: OnFailure
initContainers:
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3
name: init
command: ["sh", "-c", "sleep 5"]
containers:
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3
- image: TODO
imagePullPolicy: Always
name: nccl-test-launcher
env:
- name: LD_LIBRARY_PATH
value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
- name: PATH
value: $PATH:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin
- name: XLA_FLAGS
value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
- name: TF_XLA_FLAGS
Expand Down Expand Up @@ -55,11 +50,13 @@ spec:
- RDMAV_FORK_SAFE=1
- -x
- NCCL_PROTO=simple
- -x
- -x
- FI_LOG_LEVEL=warn
- -x
- FI_EFA_USE_DEVICE_RDMA=1
- -x
- OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
- -x
- NCCL_PROTO=simple
- --mca
- pml
Expand Down Expand Up @@ -88,10 +85,8 @@ spec:
- name: dshm
emptyDir:
medium: Memory
nodeSelector:
beta.kubernetes.io/instance-type: p4d.24xlarge
containers:
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3
- image: TODO
imagePullPolicy: Always
name: nccl-worker
volumeMounts:
Expand Down
4 changes: 2 additions & 2 deletions e2e2/test/cases/nvidia/mpi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,9 @@ func TestMPIJobPytorchTraining(t *testing.T) {
t.Fatal(err)
}
j := kubeflowv2beta1.MPIJob{
ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-multi-node"},
ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-multi-node", Namespace: "default"},
}
timeout := time.Minute * 20
timeout := time.Minute * 10
err := wait.For(conditions.New(rsrc).ResourceMatch(&j, mpiJobSucceeded),
wait.WithTimeout(timeout))
if err != nil {
Expand Down
92 changes: 92 additions & 0 deletions e2e2/test/images/Dockerfile.aws-efa-nccl-tests
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

ARG EFA_INSTALLER_VERSION=latest
# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
ARG AWS_OFI_NCCL_VERSION=1.7.4
ARG NCCL_TESTS_VERSION=master

# Install necessary dependencies
RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
libmlx5-1 \
ibverbs-utils \
libibverbs-dev \
libibverbs1 \
libnccl2 \
libnccl-dev

RUN rm -rf /opt/hpcx \
&& rm -rf /usr/local/mpi \
&& rm -rf /usr/local/ucx \
&& rm -f /etc/ld.so.conf.d/hpcx.conf \
&& ldconfig

RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
sudo \
git \
gcc \
vim \
kmod \
openssh-client \
openssh-server \
build-essential \
wget curl \
autoconf \
libtool \
gdb \
automake \
python3-distutils \
cmake \
apt-utils \
devscripts \
debhelper \
libsubunit-dev \
check \
pkg-config \
libhwloc-dev

RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& rm -rf $HOME/aws-efa-installer

# Install NCCL
# The libnccl packages are pinned to a build for CUDA 12.2, matching the
# nvidia/cuda:12.2.0 base image. Image builds have no TTY, so apt-get is run
# with -y and a non-interactive frontend; a confirmation prompt would abort the
# build. sudo is not needed because build steps already run as root.
# NOTE(review): the keyring is fetched from the ubuntu1804 repo path while the
# base image is ubuntu22.04 -- confirm this is intentional.
RUN apt-key del 7fa2af80 \
    && curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
    && dpkg -i cuda-keyring_1.0-1_all.deb \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

## Install AWS-OFI-NCCL plugin
RUN export OPAL_PREFIX="" \
&& git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
&& cd /opt/aws-ofi-nccl \
&& git checkout v${AWS_OFI_NCCL_VERSION}-aws \
&& ./autogen.sh \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-libfabric=/opt/amazon/efa/ \
--with-cuda=/usr/local/cuda \
--with-mpi=/opt/amazon/openmpi/ \
&& make && make install

# Install NCCL Tests
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
&& cd /opt/nccl-tests \
&& git checkout ${NCCL_TESTS_VERSION} \
&& make MPI=1 \
MPI_HOME=/opt/amazon/openmpi/ \
CUDA_HOME=/usr/local/cuda

# Set a default command for debugging or modify as per requirements
ENV NCCL_PROTO simple
RUN rm -rf /var/lib/apt/lists/*
ENV LD_PRELOAD /usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD
4 changes: 4 additions & 0 deletions kubetest2/internal/deployers/eksapi/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ type deployerOptions struct {
Addons []string `flag:"addons" desc:"Managed addons (name:version pairs) to create in the cluster. Use 'latest' for the most recent version, or 'default' for the default version."`
AMI string `flag:"ami" desc:"AMI for nodes"`
ClusterRoleServicePrincipal string `flag:"cluster-role-service-principal" desc:"Additional service principal that can assume the cluster role"`
EFA bool `flag:"efa" desc:"Create EFA interfaces on the node of an unmanaged nodegroup. Requires --unmanaged-nodes."`
EKSEndpointURL string `flag:"endpoint-url" desc:"Endpoint URL for the EKS API"`
EmitMetrics bool `flag:"emit-metrics" desc:"Record and emit metrics to CloudWatch"`
ExpectedAMI string `flag:"expected-ami" desc:"Expected AMI of nodes. Up will fail if the actual nodes are not utilizing the expected AMI. Defaults to --ami if defined."`
Expand Down Expand Up @@ -256,6 +257,9 @@ func (d *deployer) verifyUpFlags() error {
d.UserDataFormat = "bootstrap.sh"
klog.V(2).Infof("Using default user data format: %s", d.UserDataFormat)
}
if d.UnmanagedNodes && d.EFA && len(d.InstanceTypes) != 1 {
return fmt.Errorf("--efa requires a single instance type")
}
if d.NodeReadyTimeout == 0 {
d.NodeReadyTimeout = time.Minute * 5
}
Expand Down
89 changes: 89 additions & 0 deletions kubetest2/internal/deployers/eksapi/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ func NewNodegroupManager(clients *awsClients, resourceID string) *NodegroupManag

func (m *NodegroupManager) createNodegroup(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
if opts.UnmanagedNodes {
if opts.EFA {
return m.createUnmanagedNodegroupWithEFA(infra, cluster, opts)
}
return m.createUnmanagedNodegroup(infra, cluster, opts)
} else {
return m.createManagedNodegroup(infra, cluster, opts)
Expand Down Expand Up @@ -194,6 +197,92 @@ func (m *NodegroupManager) createUnmanagedNodegroup(infra *Infrastructure, clust
return nil
}

// createUnmanagedNodegroupWithEFA creates the unmanaged nodegroup CloudFormation
// stack from the embedded templates.UnmanagedNodegroupEFA template and blocks
// until the stack reaches create-complete (bounded by infraStackCreationTimeout).
//
// The stack is placed in the first private subnet of infra and uses the first
// entry of opts.InstanceTypes. If opts.ExpectedAMI is set, the resulting ASG is
// checked against it and a mismatch is reported as an error.
//
// An error is returned if the inputs are incomplete (no private subnet, no
// instance type), if user data generation fails, or if stack creation, waiting,
// or AMI verification fails.
func (m *NodegroupManager) createUnmanagedNodegroupWithEFA(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
	// Validate up front so that missing inputs surface as errors instead of
	// index-out-of-range panics further down.
	if len(infra.subnetsPrivate) == 0 {
		return fmt.Errorf("unmanaged nodegroup with EFA requires at least one private subnet")
	}
	if len(opts.InstanceTypes) == 0 {
		return fmt.Errorf("unmanaged nodegroup with EFA requires an instance type")
	}

	stackName := m.getUnmanagedNodegroupStackName()
	klog.Infof("creating unmanaged nodegroup with EFA stack...")
	userData, err := generateUserData(opts.UserDataFormat, cluster)
	if err != nil {
		return fmt.Errorf("generating user data for unmanaged nodegroup with EFA: %w", err)
	}
	// pull the role name out of the ARN
	nodeRoleArnParts := strings.Split(infra.nodeRole, "/")
	nodeRoleName := nodeRoleArnParts[len(nodeRoleArnParts)-1]
	input := cloudformation.CreateStackInput{
		StackName:    aws.String(stackName),
		TemplateBody: aws.String(templates.UnmanagedNodegroupEFA),
		Capabilities: []cloudformationtypes.Capability{cloudformationtypes.CapabilityCapabilityIam},
		Parameters: []cloudformationtypes.Parameter{
			{
				ParameterKey:   aws.String("ResourceId"),
				ParameterValue: aws.String(m.resourceID),
			},
			{
				ParameterKey:   aws.String("VpcId"),
				ParameterValue: aws.String(infra.vpc),
			},
			{
				ParameterKey:   aws.String("SubnetIds"),
				ParameterValue: aws.String(infra.subnetsPrivate[0]), // this is load bearing! EFA requires a private subnet
			},
			{
				ParameterKey:   aws.String("UserData"),
				ParameterValue: aws.String(userData),
			},
			{
				ParameterKey:   aws.String("ClusterName"),
				ParameterValue: aws.String(cluster.name),
			},
			{
				ParameterKey:   aws.String("NodeRoleName"),
				ParameterValue: aws.String(nodeRoleName),
			},
			{
				ParameterKey:   aws.String("NodeCount"),
				ParameterValue: aws.String(strconv.Itoa(opts.Nodes)),
			},
			{
				ParameterKey:   aws.String("SecurityGroup"),
				ParameterValue: aws.String(cluster.securityGroupId),
			},
			{
				ParameterKey:   aws.String("SSHKeyPair"),
				ParameterValue: aws.String(infra.sshKeyPair),
			},
			{
				ParameterKey:   aws.String("AMIId"),
				ParameterValue: aws.String(opts.AMI),
			},
			{
				ParameterKey:   aws.String("InstanceType"),
				ParameterValue: aws.String(opts.InstanceTypes[0]),
			},
		},
	}
	out, err := m.clients.CFN().CreateStack(context.TODO(), &input)
	if err != nil {
		return fmt.Errorf("creating unmanaged nodegroup with EFA stack %q: %w", stackName, err)
	}
	klog.Infof("waiting for unmanaged nodegroup with EFA to be created: %s", *out.StackId)
	err = cloudformation.NewStackCreateCompleteWaiter(m.clients.CFN()).
		Wait(context.TODO(),
			&cloudformation.DescribeStacksInput{
				StackName: out.StackId,
			},
			infraStackCreationTimeout)
	if err != nil {
		return fmt.Errorf("failed to wait for unmanaged nodegroup stack creation: %w", err)
	}
	klog.Infof("created unmanaged nodegroup with EFA stack: %s", *out.StackId)
	if opts.ExpectedAMI == "" {
		// Nothing to verify against.
		return nil
	}
	ok, err := m.verifyASGAMI(m.resourceID, opts.ExpectedAMI)
	if err != nil {
		return err
	}
	if !ok {
		return fmt.Errorf("ASG %s is not using expected AMI: %s", m.resourceID, opts.ExpectedAMI)
	}
	return nil
}

func (m *NodegroupManager) deleteNodegroup() error {
if err := m.deleteUnmanagedNodegroup(); err != nil {
return err
Expand Down
3 changes: 3 additions & 0 deletions kubetest2/internal/deployers/eksapi/templates/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import (
//go:embed infra.yaml
var Infrastructure string

//go:embed unmanaged-nodegroup-efa.yaml
var UnmanagedNodegroupEFA string

var (
//go:embed unmanaged-nodegroup.yaml.template
unmanagedNodegroupTemplate string
Expand Down
Loading

0 comments on commit a2fee67

Please sign in to comment.