diff --git a/e2e2/test/cases/nvidia/manifests/efa-device-plugin.yaml b/e2e2/test/cases/nvidia/manifests/efa-device-plugin.yaml
index 774ea1df5..57185cbde 100644
--- a/e2e2/test/cases/nvidia/manifests/efa-device-plugin.yaml
+++ b/e2e2/test/cases/nvidia/manifests/efa-device-plugin.yaml
@@ -52,6 +52,7 @@ spec:
               - p4de.24xlarge
               - trn1.32xlarge
               - trn1n.32xlarge
+              - p5.48xlarge
           - matchExpressions:
               - key: "node.kubernetes.io/instance-type"
                 operator: In
@@ -75,6 +76,7 @@ spec:
               - p4de.24xlarge
               - trn1.32xlarge
               - trn1n.32xlarge
+              - p5.48xlarge
       hostNetwork: true
       containers:
         - image: 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin:v0.3.3
diff --git a/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-multi-node.yaml b/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-multi-node.yaml
index aa92d9c28..722cb3da1 100644
--- a/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-multi-node.yaml
+++ b/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-multi-node.yaml
@@ -5,6 +5,9 @@ metadata:
 spec:
   slotsPerWorker: 8
   runPolicy:
+    # it may take a bit for the workers to get ready (the container image is heavy)
+    # and we don't want the launcher to reach its CrashLoopBackOff limit in the meantime
+    backoffLimit: 20
     cleanPodPolicy: Running
   mpiReplicaSpecs:
     Launcher:
@@ -12,19 +15,11 @@ spec:
       template:
         spec:
           restartPolicy: OnFailure
-          initContainers:
-            - image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3
-              name: init
-              command: ["sh", "-c", "sleep 5"]
           containers:
-            - image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3
+            - image: TODO
              imagePullPolicy: Always
              name: nccl-test-launcher
              env:
-               - name: LD_LIBRARY_PATH
-                 value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
-               - name: PATH
-                 value: $PATH:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin
                - name: XLA_FLAGS
                  value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
                - name: TF_XLA_FLAGS
@@ -55,11 +50,13 @@ spec:
             - RDMAV_FORK_SAFE=1
             - -x
             - NCCL_PROTO=simple
-            - -x 
+            - -x
             - FI_LOG_LEVEL=warn
             - -x
             - FI_EFA_USE_DEVICE_RDMA=1
             - -x
+            - OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
+            - -x
             - NCCL_PROTO=simple
             - --mca
             - pml
@@ -88,10 +85,8 @@ spec:
             - name: dshm
               emptyDir:
                 medium: Memory
-          nodeSelector:
-            beta.kubernetes.io/instance-type: p4d.24xlarge
           containers:
-            - image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:22.03-pt-py3
+            - image: TODO
              imagePullPolicy: Always
              name: nccl-worker
              volumeMounts:
diff --git a/e2e2/test/cases/nvidia/mpi_test.go b/e2e2/test/cases/nvidia/mpi_test.go
index 7c642fa16..423519b52 100644
--- a/e2e2/test/cases/nvidia/mpi_test.go
+++ b/e2e2/test/cases/nvidia/mpi_test.go
@@ -98,9 +98,9 @@ func TestMPIJobPytorchTraining(t *testing.T) {
 		t.Fatal(err)
 	}
 	j := kubeflowv2beta1.MPIJob{
-		ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-multi-node"},
+		ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-multi-node", Namespace: "default"},
 	}
-	timeout := time.Minute * 20
+	timeout := time.Minute * 10
 	err := wait.For(conditions.New(rsrc).ResourceMatch(&j, mpiJobSucceeded), wait.WithTimeout(timeout))
 	if err != nil {
 		t.Fatal(err)
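Note: `mpiJobSucceeded`, passed to `ResourceMatch` above, is a match function already defined elsewhere in mpi_test.go rather than part of this patch. For readers unfamiliar with the e2e-framework condition helpers, a minimal sketch of what such a matcher can look like follows; the package name, import paths, and condition handling are assumptions, not the repository's actual code.

    // Sketch only: a plausible shape for the mpiJobSucceeded matcher.
    // Assumes the kubeflow mpi-operator v2beta1 types and the
    // e2e-framework k8s.Object interface.
    package nvidia

    import (
        kubeflowv2beta1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
        corev1 "k8s.io/api/core/v1"
        "sigs.k8s.io/e2e-framework/klient/k8s"
    )

    func mpiJobSucceeded(obj k8s.Object) bool {
        j := obj.(*kubeflowv2beta1.MPIJob) // ResourceMatch hands back the object we passed in
        for _, c := range j.Status.Conditions {
            if c.Type == kubeflowv2beta1.JobSucceeded {
                return c.Status == corev1.ConditionTrue
            }
        }
        return false
    }
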
diff --git a/e2e2/test/images/Dockerfile.aws-efa-nccl-tests b/e2e2/test/images/Dockerfile.aws-efa-nccl-tests
new file mode 100644
index 000000000..0b2b638f8
--- /dev/null
+++ b/e2e2/test/images/Dockerfile.aws-efa-nccl-tests
@@ -0,0 +1,92 @@
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+
+ARG EFA_INSTALLER_VERSION=latest
+# 1.7.4+ is required to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
+ARG AWS_OFI_NCCL_VERSION=1.7.4
+ARG NCCL_TESTS_VERSION=master
+
+# Install necessary dependencies
+RUN apt-get update -y
+RUN apt-get remove -y --allow-change-held-packages \
+    libmlx5-1 \
+    ibverbs-utils \
+    libibverbs-dev \
+    libibverbs1 \
+    libnccl2 \
+    libnccl-dev
+
+RUN rm -rf /opt/hpcx \
+    && rm -rf /usr/local/mpi \
+    && rm -rf /usr/local/ucx \
+    && rm -f /etc/ld.so.conf.d/hpcx.conf \
+    && ldconfig
+
+RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+    sudo \
+    git \
+    gcc \
+    vim \
+    kmod \
+    openssh-client \
+    openssh-server \
+    build-essential \
+    wget curl \
+    autoconf \
+    libtool \
+    gdb \
+    automake \
+    python3-distutils \
+    cmake \
+    apt-utils \
+    devscripts \
+    debhelper \
+    libsubunit-dev \
+    check \
+    pkg-config \
+    libhwloc-dev
+
+RUN mkdir -p /var/run/sshd
+RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
+    echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
+    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
+ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
+
+# Install EFA
+RUN cd $HOME \
+    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
+    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
+    && cd aws-efa-installer \
+    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
+    && rm -rf $HOME/aws-efa-installer
+
+# Install NCCL
+RUN apt-key del 7fa2af80 \
+    && curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
+    && dpkg -i cuda-keyring_1.0-1_all.deb \
+    && apt-get update && apt-get install -y libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2
+
+## Install AWS-OFI-NCCL plugin
+RUN export OPAL_PREFIX="" \
+    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
+    && cd /opt/aws-ofi-nccl \
+    && git checkout v${AWS_OFI_NCCL_VERSION}-aws \
+    && ./autogen.sh \
+    && ./configure --prefix=/opt/aws-ofi-nccl/install \
+       --with-libfabric=/opt/amazon/efa/ \
+       --with-cuda=/usr/local/cuda \
+       --with-mpi=/opt/amazon/openmpi/ \
+    && make && make install
+
+# Install NCCL Tests
+RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
+    && cd /opt/nccl-tests \
+    && git checkout ${NCCL_TESTS_VERSION} \
+    && make MPI=1 \
+       MPI_HOME=/opt/amazon/openmpi/ \
+       CUDA_HOME=/usr/local/cuda
+
+# Default NCCL protocol; override at runtime if needed
+ENV NCCL_PROTO simple
+RUN rm -rf /var/lib/apt/lists/*
+ENV LD_PRELOAD /usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD
\ No newline at end of file
diff --git a/kubetest2/internal/deployers/eksapi/deployer.go b/kubetest2/internal/deployers/eksapi/deployer.go
index 44dd367c0..a95e16950 100644
--- a/kubetest2/internal/deployers/eksapi/deployer.go
+++ b/kubetest2/internal/deployers/eksapi/deployer.go
@@ -64,6 +64,7 @@ type deployerOptions struct {
 	Addons                      []string `flag:"addons" desc:"Managed addons (name:version pairs) to create in the cluster. Use 'latest' for the most recent version, or 'default' for the default version."`
 	AMI                         string   `flag:"ami" desc:"AMI for nodes"`
 	ClusterRoleServicePrincipal string   `flag:"cluster-role-service-principal" desc:"Additional service principal that can assume the cluster role"`
+	EFA                         bool     `flag:"efa" desc:"Create EFA interfaces on the nodes of an unmanaged nodegroup. Requires --unmanaged-nodes."`
 	EKSEndpointURL              string   `flag:"endpoint-url" desc:"Endpoint URL for the EKS API"`
 	EmitMetrics                 bool     `flag:"emit-metrics" desc:"Record and emit metrics to CloudWatch"`
 	ExpectedAMI                 string   `flag:"expected-ami" desc:"Expected AMI of nodes. Up will fail if the actual nodes are not utilizing the expected AMI. Defaults to --ami if defined."`
@@ -256,6 +257,9 @@ func (d *deployer) verifyUpFlags() error {
 		d.UserDataFormat = "bootstrap.sh"
 		klog.V(2).Infof("Using default user data format: %s", d.UserDataFormat)
 	}
+	if d.UnmanagedNodes && d.EFA && len(d.InstanceTypes) != 1 {
+		return fmt.Errorf("--efa requires a single instance type")
+	}
 	if d.NodeReadyTimeout == 0 {
 		d.NodeReadyTimeout = time.Minute * 5
 	}
diff --git a/kubetest2/internal/deployers/eksapi/nodegroup.go b/kubetest2/internal/deployers/eksapi/nodegroup.go
index 90ec0aabf..2832ce9af 100644
--- a/kubetest2/internal/deployers/eksapi/nodegroup.go
+++ b/kubetest2/internal/deployers/eksapi/nodegroup.go
@@ -41,6 +41,9 @@ func NewNodegroupManager(clients *awsClients, resourceID string) *NodegroupManager {
 
 func (m *NodegroupManager) createNodegroup(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
 	if opts.UnmanagedNodes {
+		if opts.EFA {
+			return m.createUnmanagedNodegroupWithEFA(infra, cluster, opts)
+		}
 		return m.createUnmanagedNodegroup(infra, cluster, opts)
 	} else {
 		return m.createManagedNodegroup(infra, cluster, opts)
@@ -194,6 +197,92 @@ func (m *NodegroupManager) createUnmanagedNodegroup(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
 	return nil
 }
 
+func (m *NodegroupManager) createUnmanagedNodegroupWithEFA(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
+	stackName := m.getUnmanagedNodegroupStackName()
+	klog.Infof("creating unmanaged nodegroup with EFA stack...")
+	userData, err := generateUserData(opts.UserDataFormat, cluster)
+	if err != nil {
+		return err
+	}
+	// pull the role name out of the ARN
+	nodeRoleArnParts := strings.Split(infra.nodeRole, "/")
+	nodeRoleName := nodeRoleArnParts[len(nodeRoleArnParts)-1]
+	input := cloudformation.CreateStackInput{
+		StackName:    aws.String(stackName),
+		TemplateBody: aws.String(templates.UnmanagedNodegroupEFA),
+		Capabilities: []cloudformationtypes.Capability{cloudformationtypes.CapabilityCapabilityIam},
+		Parameters: []cloudformationtypes.Parameter{
+			{
+				ParameterKey:   aws.String("ResourceId"),
+				ParameterValue: aws.String(m.resourceID),
+			},
+			{
+				ParameterKey:   aws.String("VpcId"),
+				ParameterValue: aws.String(infra.vpc),
+			},
+			{
+				ParameterKey:   aws.String("SubnetIds"),
+				ParameterValue: aws.String(infra.subnetsPrivate[0]), // this is load-bearing! EFA requires a private subnet
+			},
+			{
+				ParameterKey:   aws.String("UserData"),
+				ParameterValue: aws.String(userData),
+			},
+			{
+				ParameterKey:   aws.String("ClusterName"),
+				ParameterValue: aws.String(cluster.name),
+			},
+			{
+				ParameterKey:   aws.String("NodeRoleName"),
+				ParameterValue: aws.String(nodeRoleName),
+			},
+			{
+				ParameterKey:   aws.String("NodeCount"),
+				ParameterValue: aws.String(strconv.Itoa(opts.Nodes)),
+			},
+			{
+				ParameterKey:   aws.String("SecurityGroup"),
+				ParameterValue: aws.String(cluster.securityGroupId),
+			},
+			{
+				ParameterKey:   aws.String("SSHKeyPair"),
+				ParameterValue: aws.String(infra.sshKeyPair),
+			},
+			{
+				ParameterKey:   aws.String("AMIId"),
+				ParameterValue: aws.String(opts.AMI),
+			},
+			{
+				ParameterKey:   aws.String("InstanceType"),
+				ParameterValue: aws.String(opts.InstanceTypes[0]),
+			},
+		},
+	}
+	out, err := m.clients.CFN().CreateStack(context.TODO(), &input)
+	if err != nil {
+		return err
+	}
+	klog.Infof("waiting for unmanaged nodegroup with EFA to be created: %s", *out.StackId)
+	err = cloudformation.NewStackCreateCompleteWaiter(m.clients.CFN()).
+		Wait(context.TODO(),
+			&cloudformation.DescribeStacksInput{
+				StackName: out.StackId,
+			},
+			infraStackCreationTimeout)
+	if err != nil {
+		return fmt.Errorf("failed to wait for unmanaged nodegroup stack creation: %w", err)
+	}
+	klog.Infof("created unmanaged nodegroup with EFA stack: %s", *out.StackId)
+	if opts.ExpectedAMI != "" {
+		if ok, err := m.verifyASGAMI(m.resourceID, opts.ExpectedAMI); err != nil {
+			return err
+		} else if !ok {
+			return fmt.Errorf("ASG %s is not using expected AMI: %s", m.resourceID, opts.ExpectedAMI)
+		}
+	}
+	return nil
+}
+
 func (m *NodegroupManager) deleteNodegroup() error {
 	if err := m.deleteUnmanagedNodegroup(); err != nil {
 		return err
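Note: the new --efa validation in verifyUpFlags only guards the unmanaged+EFA path, which pairs with createUnmanagedNodegroupWithEFA consuming opts.InstanceTypes[0] above. A table-driven test is a cheap way to pin that behavior down. The sketch below is hypothetical, not part of this patch: it assumes deployerOptions is embedded in deployer (as the d.EFA and d.InstanceTypes accesses suggest) and that no other check in verifyUpFlags fails for these bare options.

    // Sketch only: hypothetical regression test for the --efa flag validation.
    func TestVerifyUpFlagsEFARequiresSingleInstanceType(t *testing.T) {
        for _, tc := range []struct {
            name          string
            instanceTypes []string
            wantErr       bool
        }{
            {"one instance type", []string{"p5.48xlarge"}, false},
            {"two instance types", []string{"p4d.24xlarge", "p5.48xlarge"}, true},
            {"no instance types", nil, true},
        } {
            t.Run(tc.name, func(t *testing.T) {
                d := &deployer{deployerOptions: deployerOptions{
                    UnmanagedNodes: true,
                    EFA:            true,
                    InstanceTypes:  tc.instanceTypes,
                }}
                if err := d.verifyUpFlags(); (err != nil) != tc.wantErr {
                    t.Fatalf("verifyUpFlags() error = %v, wantErr = %v", err, tc.wantErr)
                }
            })
        }
    }
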
diff --git a/kubetest2/internal/deployers/eksapi/templates/templates.go b/kubetest2/internal/deployers/eksapi/templates/templates.go
index eab743f33..a1bb4b4c0 100644
--- a/kubetest2/internal/deployers/eksapi/templates/templates.go
+++ b/kubetest2/internal/deployers/eksapi/templates/templates.go
@@ -8,6 +8,9 @@ import (
 //go:embed infra.yaml
 var Infrastructure string
 
+//go:embed unmanaged-nodegroup-efa.yaml
+var UnmanagedNodegroupEFA string
+
 var (
 	//go:embed unmanaged-nodegroup.yaml.template
 	unmanagedNodegroupTemplate string
diff --git a/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup-efa.yaml b/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup-efa.yaml
new file mode 100644
index 000000000..59a271e0c
--- /dev/null
+++ b/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup-efa.yaml
@@ -0,0 +1,336 @@
+---
+AWSTemplateFormatVersion: '2010-09-09'
+Description: 'kubetest2-eksapi unmanaged nodegroup with EFA support'
+
+Parameters:
+  ResourceId:
+    Description: Unique identifier for this kubetest2-eksapi execution.
+    Type: String
+
+  VpcId:
+    Type: AWS::EC2::VPC::Id
+
+  SubnetIds:
+    Type: List<AWS::EC2::Subnet::Id>
+
+  SecurityGroup:
+    Type: AWS::EC2::SecurityGroup::Id
+
+  AMIId:
+    Type: String
+    Description: Specify AMI ID for the node instances.
+
+  NodeDiskSize:
+    Type: Number
+    Description: Node disk size in gigabytes.
+    Default: 100
+
+  NodeCount:
+    Type: Number
+
+  ClusterName:
+    Type: String
+
+  NodeRoleName:
+    Description: The IAM role name of worker nodes.
+    Type: String
+
+  SSHKeyPair:
+    Type: String
+
+  UserData:
+    Type: String
+
+  InstanceType:
+    Type: String
+    Description: EFA supports only one instance type in the cluster, e.g. p3dn.24xlarge, p4d.24xlarge, or p5.48xlarge
+    Default: "p5.48xlarge"
+
+Conditions:
+  IsP4Node:
+    !Equals [Ref: InstanceType, p4d.24xlarge]
+  IsP5Node:
+    !Equals [Ref: InstanceType, p5.48xlarge]
+
+Resources:
+  EFASecurityGroup:
+    Type: "AWS::EC2::SecurityGroup"
+    Properties:
+      GroupDescription: Security group for all nodes in the cluster
+      Tags:
+        - Key: !Sub kubernetes.io/cluster/${ClusterName}
+          Value: owned
+      VpcId: !Ref VpcId
+
+  EFASecurityGroupIngress:
+    Type: "AWS::EC2::SecurityGroupIngress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow nodes to communicate with each other
+      FromPort: 0
+      ToPort: 65535
+      GroupId: !Ref EFASecurityGroup
+      IpProtocol: "-1"
+      SourceSecurityGroupId: !Ref EFASecurityGroup
+
+  EFASecurityGroupEgress:
+    Type: "AWS::EC2::SecurityGroupEgress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow the EFA worker nodes outbound communication
+      DestinationSecurityGroupId: !Ref EFASecurityGroup
+      FromPort: 0
+      ToPort: 65535
+      GroupId: !Ref EFASecurityGroup
+      IpProtocol: "-1"
+
+  EFASecurityGroupEgressAllIpv4:
+    Type: "AWS::EC2::SecurityGroupEgress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow the EFA worker nodes outbound communication
+      FromPort: 0
+      ToPort: 65535
+      CidrIp: "0.0.0.0/0"
+      GroupId: !Ref EFASecurityGroup
+      IpProtocol: "-1"
+
+  EFASecurityGroupEgressAllIpv6:
+    Type: "AWS::EC2::SecurityGroupEgress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow the EFA worker nodes outbound communication
+      FromPort: 0
+      ToPort: 65535
+      CidrIpv6: "::/0"
+      GroupId: !Ref EFASecurityGroup
+      IpProtocol: "-1"
+
+  EFASecurityGroupIngressSSHIpv4:
+    Type: "AWS::EC2::SecurityGroupIngress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow SSH
+      FromPort: 22
+      ToPort: 22
+      CidrIp: "0.0.0.0/0"
+      GroupId: !Ref EFASecurityGroup
+      IpProtocol: "tcp"
+
+  EFASecurityGroupIngressSSHIpv6:
+    Type: "AWS::EC2::SecurityGroupIngress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow SSH
+      FromPort: 22
+      ToPort: 22
+      CidrIpv6: "::/0"
+      GroupId: !Ref EFASecurityGroup
+      IpProtocol: "tcp"
+
+  EFASecurityGroupIngressControlPlane:
+    Type: "AWS::EC2::SecurityGroupIngress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow pods to communicate with the cluster API Server
+      FromPort: 443
+      ToPort: 443
+      GroupId: !Ref SecurityGroup
+      IpProtocol: tcp
+      SourceSecurityGroupId: !Ref EFASecurityGroup
+
+  EFASecurityGroupEgressControlPlane:
+    Type: "AWS::EC2::SecurityGroupEgress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow the cluster control plane to communicate with worker Kubelet and pods
+      DestinationSecurityGroupId: !Ref EFASecurityGroup
+      FromPort: 1025
+      ToPort: 65535
+      GroupId: !Ref SecurityGroup
+      IpProtocol: tcp
+
+  ControlPlaneEgressToEFASecurityGroupOn443:
+    Type: "AWS::EC2::SecurityGroupEgress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow the cluster control plane to communicate with pods running extension API servers on port 443
+      DestinationSecurityGroupId: !Ref EFASecurityGroup
+      FromPort: 443
+      ToPort: 443
+      GroupId: !Ref SecurityGroup
+      IpProtocol: tcp
+
+  EFASecurityGroupFromControlPlaneIngress:
+    Type: "AWS::EC2::SecurityGroupIngress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow worker Kubelets and pods to receive communication from the cluster control plane
+      FromPort: 1025
+      ToPort: 65535
+      GroupId: !Ref EFASecurityGroup
+      IpProtocol: tcp
+      SourceSecurityGroupId: !Ref SecurityGroup
+
+  EFASecurityGroupFromControlPlaneOn443Ingress:
+    Type: "AWS::EC2::SecurityGroupIngress"
+    DependsOn: EFASecurityGroup
+    Properties:
+      Description: Allow pods running extension API servers on port 443 to receive communication from cluster control plane
+      FromPort: 443
+      ToPort: 443
+      GroupId: !Ref EFASecurityGroup
+      IpProtocol: tcp
+      SourceSecurityGroupId: !Ref SecurityGroup
+
+  NodeInstanceProfile:
+    Type: AWS::IAM::InstanceProfile
+    Properties:
+      Path: "/"
+      Roles:
+        - !Ref NodeRoleName
+
+  NodeLaunchTemplate:
+    Type: "AWS::EC2::LaunchTemplate"
+    Properties:
+      LaunchTemplateName: !Ref ResourceId
+      LaunchTemplateData:
+        BlockDeviceMappings:
+          - DeviceName: /dev/xvda
+            Ebs:
+              DeleteOnTermination: true
+              VolumeSize: !Ref NodeDiskSize
+              VolumeType: gp2
+        IamInstanceProfile:
+          Arn: !GetAtt NodeInstanceProfile.Arn
+        ImageId: !Ref AMIId
+        InstanceType: !Ref InstanceType
+        KeyName: !Ref SSHKeyPair
+        NetworkInterfaces: !If
+          - IsP5Node
+          -
+            - Description: NetworkInterfaces Configuration For EFA and EKS
+              NetworkCardIndex: 0
+              DeviceIndex: 0
+              InterfaceType: efa
+              Groups:
+                - !Ref EFASecurityGroup
+            - Description: NetworkInterfaces Configuration For EFA and EKS
+              NetworkCardIndex: 4
+              DeviceIndex: 1
+              InterfaceType: efa
+              Groups:
+                - !Ref EFASecurityGroup
+            - Description: NetworkInterfaces Configuration For EFA and EKS
+              NetworkCardIndex: 8
+              DeviceIndex: 1
+              InterfaceType: efa
+              Groups:
+                - !Ref EFASecurityGroup
+            - Description: NetworkInterfaces Configuration For EFA and EKS
+              NetworkCardIndex: 12
+              DeviceIndex: 1
+              InterfaceType: efa
+              Groups:
+                - !Ref EFASecurityGroup
+            - Description: NetworkInterfaces Configuration For EFA and EKS
+              NetworkCardIndex: 16
+              DeviceIndex: 1
+              InterfaceType: efa
+              Groups:
+                - !Ref EFASecurityGroup
+            - Description: NetworkInterfaces Configuration For EFA and EKS
+              NetworkCardIndex: 20
+              DeviceIndex: 1
+              InterfaceType: efa
+              Groups:
+                - !Ref EFASecurityGroup
+            - Description: NetworkInterfaces Configuration For EFA and EKS
+              NetworkCardIndex: 24
+              DeviceIndex: 1
+              InterfaceType: efa
+              Groups:
+                - !Ref EFASecurityGroup
+            - Description: NetworkInterfaces Configuration For EFA and EKS
+              NetworkCardIndex: 28
+              DeviceIndex: 1
+              InterfaceType: efa
+              Groups:
+                - !Ref EFASecurityGroup
+          - !If
+            - IsP4Node
+            -
+              - Description: NetworkInterfaces Configuration For EFA and EKS
+                NetworkCardIndex: 0
+                DeviceIndex: 0
+                InterfaceType: efa
+                Groups:
+                  - !Ref EFASecurityGroup
+              - Description: NetworkInterfaces Configuration For EFA and EKS
+                NetworkCardIndex: 1
+                DeviceIndex: 1
+                InterfaceType: efa
+                Groups:
+                  - !Ref EFASecurityGroup
+              - Description: NetworkInterfaces Configuration For EFA and EKS
+                NetworkCardIndex: 2
+                DeviceIndex: 2
+                InterfaceType: efa
+                Groups:
+                  - !Ref EFASecurityGroup
+              - Description: NetworkInterfaces Configuration For EFA and EKS
+                NetworkCardIndex: 3
+                DeviceIndex: 3
+                InterfaceType: efa
+                Groups:
+                  - !Ref EFASecurityGroup
+            -
+              - Description: NetworkInterfaces Configuration For EFA and EKS
+                NetworkCardIndex: 0
+                DeviceIndex: 0
+                InterfaceType: efa
+                Groups:
+                  - !Ref EFASecurityGroup
+        UserData:
+          Fn::Base64:
+            Fn::Sub: |
+              Content-Type: multipart/mixed; boundary="BOUNDARY"
+              MIME-Version: 1.0
+
+              --BOUNDARY
+              ${UserData}
+
+              --BOUNDARY
+              Content-Type: text/x-shellscript; charset="us-ascii"
+              MIME-Version: 1.0
+
+              #!/usr/bin/env bash
+              /opt/aws/bin/cfn-signal \
+                --stack ${AWS::StackName} \
+                --resource NodeGroup \
+                --region ${AWS::Region}
+
+              --BOUNDARY--
+
+  NodeGroup:
+    Type: AWS::AutoScaling::AutoScalingGroup
+    Properties:
+      AutoScalingGroupName: !Ref ResourceId
+      MixedInstancesPolicy:
+        InstancesDistribution:
+          OnDemandAllocationStrategy: "prioritized"
+        LaunchTemplate:
+          LaunchTemplateSpecification:
+            LaunchTemplateId: !Ref NodeLaunchTemplate
+            Version: !GetAtt NodeLaunchTemplate.LatestVersionNumber
+      DesiredCapacity: !Ref NodeCount
+      MinSize: !Ref NodeCount
+      MaxSize: !Ref NodeCount
+      VPCZoneIdentifier: !Ref SubnetIds
+      Tags:
+        - Key: Name
+          Value: !Sub ${ClusterName}-Node
+          PropagateAtLaunch: true
+        - Key: !Sub kubernetes.io/cluster/${ClusterName}
+          Value: owned
+          PropagateAtLaunch: true
diff --git a/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup.yaml.template b/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup.yaml.template
index cfecbfe28..da3bb2079 100644
--- a/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup.yaml.template
+++ b/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup.yaml.template
@@ -86,15 +86,6 @@ Resources:
           Value: owned
           PropagateAtLaunch: true
 
-  NodeLifecycleHook:
-    Type: AWS::AutoScaling::LifecycleHook
-    Properties:
-      AutoScalingGroupName: !Ref NodeGroup
-      HeartbeatTimeout: 60
-      DefaultResult: CONTINUE
-      LifecycleHookName: !Ref ResourceId
-      LifecycleTransition: autoscaling:EC2_INSTANCE_TERMINATING
-
   NodeLaunchTemplate:
     Type: "AWS::EC2::LaunchTemplate"
     Properties:
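Note on the launch template above: for p5.48xlarge it attaches one EFA interface on every fourth network card (0, 4, ..., 28), with device index 0 only on network card 0 (EC2 requires the primary interface there) and 1 everywhere else; p4d.24xlarge gets one interface on each of network cards 0-3, and every other instance type falls through to a single EFA interface. If the p5 list ever needs to be regenerated or extended, a throwaway sketch like the following reproduces it; this is purely illustrative and not part of the patch.

    // Sketch only: prints the p5.48xlarge NetworkInterfaces list used above.
    package main

    import "fmt"

    func main() {
        for card := 0; card <= 28; card += 4 {
            deviceIndex := 1
            if card == 0 {
                deviceIndex = 0 // the primary interface must live on network card 0
            }
            fmt.Println("- Description: NetworkInterfaces Configuration For EFA and EKS")
            fmt.Printf("  NetworkCardIndex: %d\n", card)
            fmt.Printf("  DeviceIndex: %d\n", deviceIndex)
            fmt.Println("  InterfaceType: efa")
            fmt.Println("  Groups:")
            fmt.Println("    - !Ref EFASecurityGroup")
        }
    }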