Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add EFA NCCL test case, unmanaged nodegroup template #427

Merged
merged 1 commit into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions e2e2/test/cases/nvidia/manifests/efa-device-plugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ spec:
- p4de.24xlarge
- trn1.32xlarge
- trn1n.32xlarge
- p5.48xlarge
Copy link
Contributor

@Issacwww Issacwww Feb 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have some control over it so that we can easily tune/stabilize the test?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is adding p5 types to the nodeSelector for the EFA device plugin; it's just a prerequisite for using p5s.

- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
Expand All @@ -75,6 +76,7 @@ spec:
- p4de.24xlarge
- trn1.32xlarge
- trn1n.32xlarge
- p5.48xlarge
hostNetwork: true
containers:
- image: 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin:v0.3.3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,21 @@ metadata:
spec:
slotsPerWorker: 8
runPolicy:
# it may take a bit for the workers to get ready (the container image is heavy)
# and we don't want the launcher to reach its CrashLoopBackoff limit in the meantime
backoffLimit: 20
cleanPodPolicy: Running
mpiReplicaSpecs:
Launcher:
replicas: 1
template:
spec:
restartPolicy: OnFailure
initContainers:
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3
name: init
command: ["sh", "-c", "sleep 5"]
containers:
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3
- image: TODO
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Temporary until there's a build pipeline set up for the image added in this PR.

imagePullPolicy: Always
name: nccl-test-launcher
env:
- name: LD_LIBRARY_PATH
value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
- name: PATH
value: $PATH:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin
- name: XLA_FLAGS
value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
- name: TF_XLA_FLAGS
Expand Down Expand Up @@ -55,11 +50,13 @@ spec:
- RDMAV_FORK_SAFE=1
- -x
- NCCL_PROTO=simple
- -x
- -x
- FI_LOG_LEVEL=warn
- -x
- FI_EFA_USE_DEVICE_RDMA=1
- -x
- OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is important: it will fail the test case if something goes wrong with EFA RDMA, instead of falling back to the dirt-slow host buffer copy.

- -x
- NCCL_PROTO=simple
- --mca
- pml
Expand Down Expand Up @@ -88,10 +85,8 @@ spec:
- name: dshm
emptyDir:
medium: Memory
nodeSelector:
beta.kubernetes.io/instance-type: p4d.24xlarge
containers:
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3
- image: TODO
imagePullPolicy: Always
name: nccl-worker
volumeMounts:
Expand Down
4 changes: 2 additions & 2 deletions e2e2/test/cases/nvidia/mpi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,9 @@ func TestMPIJobPytorchTraining(t *testing.T) {
t.Fatal(err)
}
j := kubeflowv2beta1.MPIJob{
ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-multi-node"},
ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-multi-node", Namespace: "default"},
}
timeout := time.Minute * 20
timeout := time.Minute * 10
err := wait.For(conditions.New(rsrc).ResourceMatch(&j, mpiJobSucceeded),
wait.WithTimeout(timeout))
if err != nil {
Expand Down
92 changes: 92 additions & 0 deletions e2e2/test/images/Dockerfile.aws-efa-nccl-tests
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Image for running NCCL tests over EFA. Starting from the CUDA devel image it
# installs the EFA installer payload, a pinned NCCL, the aws-ofi-nccl plugin and
# NVIDIA's nccl-tests, the last two built against Open MPI under /opt/amazon/openmpi.
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

# EFA installer release to download; substituted into the tarball file name below.
ARG EFA_INSTALLER_VERSION=latest
# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
# The plugin is checked out at the git tag "v<version>-aws".
ARG AWS_OFI_NCCL_VERSION=1.7.4
# Git ref of NVIDIA/nccl-tests to check out before building.
ARG NCCL_TESTS_VERSION=master

# Install necessary dependencies
RUN apt-get update -y
# Remove the verbs/mlx5 and NCCL packages listed here; NCCL is reinstalled at a
# pinned version further down (presumably the EFA installer supplies the rest — confirm).
RUN apt-get remove -y --allow-change-held-packages \
libmlx5-1 \
ibverbs-utils \
libibverbs-dev \
libibverbs1 \
libnccl2 \
libnccl-dev

# Delete the HPC-X, MPI and UCX directories and the HPC-X linker config if they
# exist, then refresh the dynamic linker cache.
RUN rm -rf /opt/hpcx \
&& rm -rf /usr/local/mpi \
&& rm -rf /usr/local/ucx \
&& rm -f /etc/ld.so.conf.d/hpcx.conf \
&& ldconfig

# Build toolchain, SSH client/server and debugging utilities.
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
sudo \
git \
gcc \
vim \
kmod \
openssh-client \
openssh-server \
build-essential \
wget curl \
autoconf \
libtool \
gdb \
automake \
python3-distutils \
cmake \
apt-utils \
devscripts \
debhelper \
libsubunit-dev \
check \
pkg-config \
libhwloc-dev

# SSH setup: create the sshd runtime directory, turn off StrictHostKeyChecking,
# send known hosts to /dev/null and disable StrictModes
# (presumably so the MPI launcher can reach workers without prompts — confirm).
RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
# Put the Open MPI, EFA, aws-ofi-nccl and CUDA locations on the library and
# executable search paths ahead of the inherited values.
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
# Downloads and unpacks the installer tarball, runs efa_installer.sh, then removes
# the unpacked directory. NOTE(review): the downloaded .tar.gz itself is left in $HOME.
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& rm -rf $HOME/aws-efa-installer

# Install NCCL
# Replaces apt key 7fa2af80 with the cuda-keyring package, then installs NCCL
# pinned to 2.18.5-1+cuda12.2.
# NOTE(review): the keyring is fetched from the ubuntu1804 repo path although the
# base image is ubuntu22.04 — confirm this is intended.
# NOTE(review): "apt install" runs without -y — confirm it cannot prompt during the build.
RUN apt-key del 7fa2af80 \
&& curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

## Install AWS-OFI-NCCL plugin
# Clones the plugin, checks out the requested tag and builds it into
# /opt/aws-ofi-nccl/install against the libfabric under /opt/amazon/efa,
# CUDA under /usr/local/cuda and Open MPI under /opt/amazon/openmpi.
RUN export OPAL_PREFIX="" \
&& git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
&& cd /opt/aws-ofi-nccl \
&& git checkout v${AWS_OFI_NCCL_VERSION}-aws \
&& ./autogen.sh \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-libfabric=/opt/amazon/efa/ \
--with-cuda=/usr/local/cuda \
--with-mpi=/opt/amazon/openmpi/ \
&& make && make install

# Install NCCL Tests
# Built in place under /opt/nccl-tests with MPI support enabled (MPI=1).
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
&& cd /opt/nccl-tests \
&& git checkout ${NCCL_TESTS_VERSION} \
&& make MPI=1 \
MPI_HOME=/opt/amazon/openmpi/ \
CUDA_HOME=/usr/local/cuda

# Runtime defaults. No CMD or ENTRYPOINT is set by this file.
# Use the "simple" NCCL protocol unless overridden.
ENV NCCL_PROTO simple
# Remove the apt package lists.
RUN rm -rf /var/lib/apt/lists/*
# Preload the libnccl.so found under /usr/lib/x86_64-linux-gnu.
ENV LD_PRELOAD /usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD
4 changes: 4 additions & 0 deletions kubetest2/internal/deployers/eksapi/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ type deployerOptions struct {
Addons []string `flag:"addons" desc:"Managed addons (name:version pairs) to create in the cluster. Use 'latest' for the most recent version, or 'default' for the default version."`
AMI string `flag:"ami" desc:"AMI for nodes"`
ClusterRoleServicePrincipal string `flag:"cluster-role-service-principal" desc:"Additional service principal that can assume the cluster role"`
EFA bool `flag:"efa" desc:"Create EFA interfaces on the node of an unmanaged nodegroup. Requires --unmanaged-nodes."`
EKSEndpointURL string `flag:"endpoint-url" desc:"Endpoint URL for the EKS API"`
EmitMetrics bool `flag:"emit-metrics" desc:"Record and emit metrics to CloudWatch"`
ExpectedAMI string `flag:"expected-ami" desc:"Expected AMI of nodes. Up will fail if the actual nodes are not utilizing the expected AMI. Defaults to --ami if defined."`
Expand Down Expand Up @@ -256,6 +257,9 @@ func (d *deployer) verifyUpFlags() error {
d.UserDataFormat = "bootstrap.sh"
klog.V(2).Infof("Using default user data format: %s", d.UserDataFormat)
}
if d.UnmanagedNodes && d.EFA && len(d.InstanceTypes) != 1 {
return fmt.Errorf("--efa requires a single instance type")
}
if d.NodeReadyTimeout == 0 {
d.NodeReadyTimeout = time.Minute * 5
}
Expand Down
89 changes: 89 additions & 0 deletions kubetest2/internal/deployers/eksapi/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ func NewNodegroupManager(clients *awsClients, resourceID string) *NodegroupManag

func (m *NodegroupManager) createNodegroup(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
if opts.UnmanagedNodes {
if opts.EFA {
return m.createUnmanagedNodegroupWithEFA(infra, cluster, opts)
}
return m.createUnmanagedNodegroup(infra, cluster, opts)
} else {
return m.createManagedNodegroup(infra, cluster, opts)
Expand Down Expand Up @@ -194,6 +197,92 @@ func (m *NodegroupManager) createUnmanagedNodegroup(infra *Infrastructure, clust
return nil
}

// createUnmanagedNodegroupWithEFA creates the unmanaged nodegroup from the EFA
// CloudFormation template (templates.UnmanagedNodegroupEFA) and blocks until
// stack creation has completed or infraStackCreationTimeout has elapsed.
//
// The stack is placed in the first private subnet of infra and uses the first
// entry of opts.InstanceTypes. An error is returned, rather than panicking, if
// either list is empty.
//
// When opts.ExpectedAMI is set, the AMI used by the resulting ASG is verified
// and a mismatch is reported as an error.
func (m *NodegroupManager) createUnmanagedNodegroupWithEFA(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
	// Both slices are indexed below; fail with a descriptive error instead of
	// an index-out-of-range panic when the caller did not populate them.
	if len(infra.subnetsPrivate) == 0 {
		return fmt.Errorf("unmanaged nodegroup with EFA requires at least one private subnet")
	}
	if len(opts.InstanceTypes) == 0 {
		return fmt.Errorf("unmanaged nodegroup with EFA requires an instance type")
	}
	stackName := m.getUnmanagedNodegroupStackName()
	klog.Infof("creating unmanaged nodegroup with EFA stack...")
	userData, err := generateUserData(opts.UserDataFormat, cluster)
	if err != nil {
		return err
	}
	// pull the role name out of the ARN
	nodeRoleArnParts := strings.Split(infra.nodeRole, "/")
	nodeRoleName := nodeRoleArnParts[len(nodeRoleArnParts)-1]
	input := cloudformation.CreateStackInput{
		StackName:    aws.String(stackName),
		TemplateBody: aws.String(templates.UnmanagedNodegroupEFA),
		Capabilities: []cloudformationtypes.Capability{cloudformationtypes.CapabilityCapabilityIam},
		Parameters: []cloudformationtypes.Parameter{
			{
				ParameterKey:   aws.String("ResourceId"),
				ParameterValue: aws.String(m.resourceID),
			},
			{
				ParameterKey:   aws.String("VpcId"),
				ParameterValue: aws.String(infra.vpc),
			},
			{
				ParameterKey:   aws.String("SubnetIds"),
				ParameterValue: aws.String(infra.subnetsPrivate[0]), // this is load bearing! EFA requires a private subnet
			},
			{
				ParameterKey:   aws.String("UserData"),
				ParameterValue: aws.String(userData),
			},
			{
				ParameterKey:   aws.String("ClusterName"),
				ParameterValue: aws.String(cluster.name),
			},
			{
				ParameterKey:   aws.String("NodeRoleName"),
				ParameterValue: aws.String(nodeRoleName),
			},
			{
				ParameterKey:   aws.String("NodeCount"),
				ParameterValue: aws.String(strconv.Itoa(opts.Nodes)),
			},
			{
				ParameterKey:   aws.String("SecurityGroup"),
				ParameterValue: aws.String(cluster.securityGroupId),
			},
			{
				ParameterKey:   aws.String("SSHKeyPair"),
				ParameterValue: aws.String(infra.sshKeyPair),
			},
			{
				ParameterKey:   aws.String("AMIId"),
				ParameterValue: aws.String(opts.AMI),
			},
			{
				ParameterKey:   aws.String("InstanceType"),
				ParameterValue: aws.String(opts.InstanceTypes[0]),
			},
		},
	}
	out, err := m.clients.CFN().CreateStack(context.TODO(), &input)
	if err != nil {
		return err
	}
	klog.Infof("waiting for unmanaged nodegroup with EFA to be created: %s", *out.StackId)
	err = cloudformation.NewStackCreateCompleteWaiter(m.clients.CFN()).
		Wait(context.TODO(),
			&cloudformation.DescribeStacksInput{
				StackName: out.StackId,
			},
			infraStackCreationTimeout)
	if err != nil {
		return fmt.Errorf("failed to wait for unmanaged nodegroup stack creation: %w", err)
	}
	klog.Infof("created unmanaged nodegroup with EFA stack: %s", *out.StackId)
	if opts.ExpectedAMI != "" {
		ok, err := m.verifyASGAMI(m.resourceID, opts.ExpectedAMI)
		if err != nil {
			return err
		}
		if !ok {
			return fmt.Errorf("ASG %s is not using expected AMI: %s", m.resourceID, opts.ExpectedAMI)
		}
	}
	return nil
}

func (m *NodegroupManager) deleteNodegroup() error {
if err := m.deleteUnmanagedNodegroup(); err != nil {
return err
Expand Down
3 changes: 3 additions & 0 deletions kubetest2/internal/deployers/eksapi/templates/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import (
//go:embed infra.yaml
var Infrastructure string

//go:embed unmanaged-nodegroup-efa.yaml
var UnmanagedNodegroupEFA string

var (
//go:embed unmanaged-nodegroup.yaml.template
unmanagedNodegroupTemplate string
Expand Down
Loading
Loading