Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collect node logs after --up, --down phases #491

Merged
merged 1 commit into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions kubetest2/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@ toolchain go1.22.1

require (
github.com/aws/aws-sdk-go v1.51.2
github.com/aws/aws-sdk-go-v2 v1.26.0
github.com/aws/aws-sdk-go-v2 v1.32.2
github.com/aws/aws-sdk-go-v2/config v1.27.8
github.com/aws/aws-sdk-go-v2/service/autoscaling v1.40.4
github.com/aws/aws-sdk-go-v2/service/cloudformation v1.48.0
github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.36.3
github.com/aws/aws-sdk-go-v2/service/ec2 v1.151.1
github.com/aws/aws-sdk-go-v2/service/eks v1.41.2
github.com/aws/smithy-go v1.20.1
github.com/aws/smithy-go v1.22.0
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51
github.com/octago/sflags v0.2.0
github.com/pkg/errors v0.9.1
Expand Down Expand Up @@ -69,13 +69,14 @@ require (
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
github.com/aws/aws-sdk-go-v2/credentials v1.17.8 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.21 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 // indirect
github.com/aws/aws-sdk-go-v2/service/ecr v1.27.3 // indirect
github.com/aws/aws-sdk-go-v2/service/ecrpublic v1.23.3 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.1 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.6 // indirect
github.com/aws/aws-sdk-go-v2/service/ssm v1.55.2
github.com/aws/aws-sdk-go-v2/service/sso v1.20.3 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.23.3 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.28.5 // indirect
Expand Down
10 changes: 10 additions & 0 deletions kubetest2/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ github.com/aws/aws-sdk-go v1.51.2 h1:Ruwgz5aqIXin5Yfcgc+PCzoqW5tEGb9aDL/JWDsre7k
github.com/aws/aws-sdk-go v1.51.2/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk=
github.com/aws/aws-sdk-go-v2 v1.26.0 h1:/Ce4OCiM3EkpW7Y+xUnfAFpchU78K7/Ug01sZni9PgA=
github.com/aws/aws-sdk-go-v2 v1.26.0/go.mod h1:35hUlJVYd+M++iLI3ALmVwMOyRYMmRqUXpTtRGW+K9I=
github.com/aws/aws-sdk-go-v2 v1.32.2 h1:AkNLZEyYMLnx/Q/mSKkcMqwNFXMAvFto9bNsHqcTduI=
github.com/aws/aws-sdk-go-v2 v1.32.2/go.mod h1:2SK5n0a2karNTv5tbP1SjsX0uhttou00v/HpXKM1ZUo=
github.com/aws/aws-sdk-go-v2/config v1.27.8 h1:0r8epOsiJ7YJz65MGcb8i91ehFp4kvvFe2qkq5oYeRI=
github.com/aws/aws-sdk-go-v2/config v1.27.8/go.mod h1:XsmYKxYNuIhLsFddpNds+j9H5XKzjWDdg/SZngiwFio=
github.com/aws/aws-sdk-go-v2/credentials v1.17.8 h1:WUdNLXbyNbU07V/WFrSOBXqZTDgmmMNMgUFzpYOKJhw=
Expand All @@ -138,8 +140,12 @@ github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.4 h1:S+L2QSKhUuShih3aq9P/mkz
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.4/go.mod h1:nQ3how7DMnFMWiU1SpECohgC82fpn4cKZ875NDMmwtA=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.4 h1:0ScVK/4qZ8CIW0k8jOeFVsyS/sAiXpYxRBLolMkuLQM=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.4/go.mod h1:84KyjNZdHC6QZW08nfHI6yZgPd+qRgaWcYsyLUo3QY8=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.21 h1:UAsR3xA31QGf79WzpG/ixT9FZvQlh5HY1NRqSHBNOCk=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.21/go.mod h1:JNr43NFf5L9YaG3eKTm7HQzls9J+A9YYcGI5Quh1r2Y=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4 h1:sHmMWWX5E7guWEFQ9SVo6A3S4xpPrWnd77a6y4WM6PU=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4/go.mod h1:WjpDrhWisWOIoS9n3nk67A3Ll1vfULJ9Kq6h29HTD48=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21 h1:6jZVETqmYCadGFvrYEQfC5fAQmlo80CeL5psbno6r0s=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21/go.mod h1:1SR0GbLlnN3QUmYaflZNiH1ql+1qrSiB2vwcJ+4UM60=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 h1:hT8rVHwugYE2lEfdFE0QWVo81lF7jMrYJVDWI+f+VxU=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0/go.mod h1:8tu/lYfQfFe6IGnaOdrpVgEL2IrrDOf6/m9RQum4NkY=
github.com/aws/aws-sdk-go-v2/service/autoscaling v1.40.4 h1:f4pkN5PVSqlGxD2gZvboz6SRaeoykgknflMPBVuhcGs=
Expand All @@ -162,6 +168,8 @@ github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.6 h1:b+E7zIUHM
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.6/go.mod h1:S2fNV0rxrP78NhPbCZeQgY8H9jdDMeGtwcfZIRxzBqU=
github.com/aws/aws-sdk-go-v2/service/kms v1.27.9 h1:W9PbZAZAEcelhhjb7KuwUtf+Lbc+i7ByYJRuWLlnxyQ=
github.com/aws/aws-sdk-go-v2/service/kms v1.27.9/go.mod h1:2tFmR7fQnOdQlM2ZCEPpFnBIQD1U8wmXmduBgZbOag0=
github.com/aws/aws-sdk-go-v2/service/ssm v1.55.2 h1:z6Pq4+jtKlhK4wWJGHRGwMLGjC1HZwAO3KJr/Na0tSU=
github.com/aws/aws-sdk-go-v2/service/ssm v1.55.2/go.mod h1:DSmu/VZzpQlAubWBbAvNpt+S4k/XweglJi4XaDGyvQk=
github.com/aws/aws-sdk-go-v2/service/sso v1.20.3 h1:mnbuWHOcM70/OFUlZZ5rcdfA8PflGXXiefU/O+1S3+8=
github.com/aws/aws-sdk-go-v2/service/sso v1.20.3/go.mod h1:5HFu51Elk+4oRBZVxmHrSds5jFXmFj8C3w7DVF2gnrs=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.23.3 h1:uLq0BKatTmDzWa/Nu4WO0M1AaQDaPpwTKAeByEc6WFM=
Expand All @@ -170,6 +178,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.28.5 h1:J/PpTf/hllOjx8Xu9DMflff3Fajf
github.com/aws/aws-sdk-go-v2/service/sts v1.28.5/go.mod h1:0ih0Z83YDH/QeQ6Ori2yGE2XvWYv/Xm+cZc01LC6oK0=
github.com/aws/smithy-go v1.20.1 h1:4SZlSlMr36UEqC7XOyRVb27XMeZubNcBNN+9IgEPIQw=
github.com/aws/smithy-go v1.20.1/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E=
github.com/aws/smithy-go v1.22.0 h1:uunKnWlcoL3zO7q+gG2Pk53joueEOsnNB28QdMsmiMM=
github.com/aws/smithy-go v1.22.0/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg=
github.com/awslabs/amazon-ecr-credential-helper/ecr-login v0.0.0-20240318154307-a1a918375412 h1:tfbmGNeOidVXzO1I7zo/WsT5QX7Aa0BGTbnEAE4FG3E=
github.com/awslabs/amazon-ecr-credential-helper/ecr-login v0.0.0-20240318154307-a1a918375412/go.mod h1:kcUkjB9HwuV7PSck2b60kJtgDy+eTHWuAP0kb93FXsk=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
Expand Down
7 changes: 7 additions & 0 deletions kubetest2/internal/deployers/eksapi/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,23 @@ import (
"github.com/aws/aws-sdk-go-v2/service/cloudformation"
"github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/aws/aws-sdk-go-v2/service/eks"
"github.com/aws/aws-sdk-go-v2/service/ssm"
)

type awsClients struct {
_eks *eks.Client
_cfn *cloudformation.Client
_ec2 *ec2.Client
_asg *autoscaling.Client
_ssm *ssm.Client
}

func newAWSClients(config aws.Config, eksEndpointURL string) *awsClients {
clients := awsClients{
_cfn: cloudformation.NewFromConfig(config),
_ec2: ec2.NewFromConfig(config),
_asg: autoscaling.NewFromConfig(config),
_ssm: ssm.NewFromConfig(config),
}
if eksEndpointURL != "" {
clients._eks = eks.NewFromConfig(config, func(o *eks.Options) {
Expand All @@ -46,3 +49,7 @@ func (c *awsClients) EC2() *ec2.Client {
func (c *awsClients) ASG() *autoscaling.Client {
return c._asg
}

func (c *awsClients) SSM() *ssm.Client {
return c._ssm
}
36 changes: 25 additions & 11 deletions kubetest2/internal/deployers/eksapi/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/octago/sflags/gen/gpflag"
"github.com/spf13/pflag"
"golang.org/x/exp/slices"
"k8s.io/client-go/kubernetes"
"k8s.io/klog"
"sigs.k8s.io/kubetest2/pkg/types"
)
Expand All @@ -41,12 +42,15 @@ type deployer struct {
clusterManager *ClusterManager
addonManager *AddonManager
nodegroupManager *NodegroupManager
logManager *logManager

awsClients *awsClients

infra *Infrastructure
cluster *Cluster

k8sClient *kubernetes.Clientset

initTime time.Time
}

Expand All @@ -65,6 +69,7 @@ type deployerOptions struct {
IPFamily string `flag:"ip-family" desc:"IP family for the cluster (ipv4 or ipv6)"`
KubeconfigPath string `flag:"kubeconfig" desc:"Path to kubeconfig"`
KubernetesVersion string `flag:"kubernetes-version" desc:"cluster Kubernetes version"`
LogBucket string `flag:"log-bucket" desc:"S3 bucket for storing logs for each run. If empty, logs will not be stored."`
NodeCreationTimeout time.Duration `flag:"node-creation-timeout" desc:"Time to wait for nodes to be created/launched. This should consider instance availability."`
NodeReadyTimeout time.Duration `flag:"node-ready-timeout" desc:"Time to wait for all nodes to become ready"`
Nodes int `flag:"nodes" desc:"number of nodes to launch in cluster"`
Expand Down Expand Up @@ -117,6 +122,7 @@ func (d *deployer) Init() error {
d.clusterManager = NewClusterManager(d.awsClients, resourceID)
d.addonManager = NewAddonManager(d.awsClients)
d.nodegroupManager = NewNodegroupManager(d.awsClients, resourceID)
d.logManager = NewLogManager(d.awsClients, resourceID)
return nil
}

Expand Down Expand Up @@ -171,12 +177,12 @@ func (d *deployer) Up() error {
if err != nil {
return err
}
k8sClient, err := newKubernetesClient(kubeconfig)
d.k8sClient, err = newKubernetesClient(kubeconfig)
if err != nil {
return err
}
if d.UnmanagedNodes {
if err := createAWSAuthConfigMap(k8sClient, d.NodeNameStrategy, d.infra.nodeRole); err != nil {
if err := createAWSAuthConfigMap(d.k8sClient, d.NodeNameStrategy, d.infra.nodeRole); err != nil {
return err
}
}
Expand All @@ -187,21 +193,25 @@ func (d *deployer) Up() error {
return err
}
if d.deployerOptions.TuneVPCCNI {
if err := tuneVPCCNI(k8sClient); err != nil {
if err := tuneVPCCNI(d.k8sClient); err != nil {
return err
}
}
if err := d.nodegroupManager.createNodegroup(d.infra, d.cluster, &d.deployerOptions); err != nil {
return err
}
if err := waitForReadyNodes(k8sClient, d.Nodes, d.NodeReadyTimeout); err != nil {
if err := waitForReadyNodes(d.k8sClient, d.Nodes, d.NodeReadyTimeout); err != nil {
return err
}
if d.EmitMetrics {
if err := emitNodeMetrics(d.metrics, k8sClient, d.awsClients.EC2()); err != nil {
if err := emitNodeMetrics(d.metrics, d.k8sClient, d.awsClients.EC2()); err != nil {
return err
}
}
if err := d.logManager.gatherLogsFromNodes(d.k8sClient, &d.deployerOptions, deployerPhaseUp); err != nil {
klog.Warningf("failed to gather logs from nodes: %v", err)
// don't return err, this isn't critical
}
return nil
}

Expand All @@ -220,11 +230,11 @@ func (d *deployer) verifyUpFlags() error {
}
if d.Nodes == 0 {
d.Nodes = 3
klog.V(2).Infof("Using default number of nodes: %d", d.Nodes)
klog.Infof("Using default number of nodes: %d", d.Nodes)
}
if d.IPFamily == "" {
d.IPFamily = string(ekstypes.IpFamilyIpv4)
klog.V(2).Infof("Using default IP family: %s", d.IPFamily)
klog.Infof("Using default IP family: %s", d.IPFamily)
}
if d.UnmanagedNodes {
if d.AMI == "" {
Expand All @@ -235,15 +245,15 @@ func (d *deployer) verifyUpFlags() error {
}
if d.NodeNameStrategy == "" {
d.NodeNameStrategy = "EC2PrivateDNSName"
klog.V(2).Infof("Using default node name strategy: EC2PrivateDNSName")
klog.Infof("Using default node name strategy: EC2PrivateDNSName")
} else {
if !slices.Contains(SupportedNodeNameStrategy, d.NodeNameStrategy) {
return fmt.Errorf("--node-name-strategy must be one of the following values: ['SessionName', 'EC2PrivateDNSName']")
}
}
if d.UserDataFormat == "" {
d.UserDataFormat = "bootstrap.sh"
klog.V(2).Infof("Using default user data format: %s", d.UserDataFormat)
klog.Infof("Using default user data format: %s", d.UserDataFormat)
}
if d.EFA && len(d.InstanceTypes) != 1 {
return fmt.Errorf("--efa requires a single instance type")
Expand All @@ -253,8 +263,8 @@ func (d *deployer) verifyUpFlags() error {
return fmt.Errorf("--ami should not be provided without --unmanaged-nodes")
}
if d.AMIType == "" {
d.AMIType = "AL2023_STANDARD_X86_64"
klog.V(2).Infof("Using default AMI type: %s", d.AMIType)
d.AMIType = "AL2023_x86_64_STANDARD"
klog.Infof("Using default AMI type: %s", d.AMIType)
}
}
if d.NodeCreationTimeout == 0 {
Expand Down Expand Up @@ -283,6 +293,10 @@ func (d *deployer) IsUp() (up bool, err error) {
}

func (d *deployer) Down() error {
if err := d.logManager.gatherLogsFromNodes(d.k8sClient, &d.deployerOptions, deployerPhaseDown); err != nil {
klog.Warningf("failed to gather logs from nodes: %v", err)
// don't return err, this isn't critical
}
return deleteResources(d.infraManager, d.clusterManager, d.nodegroupManager)
}

Expand Down
17 changes: 17 additions & 0 deletions kubetest2/internal/deployers/eksapi/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,23 @@ func createAWSAuthConfigMap(client *kubernetes.Clientset, nodeNameStrategy strin
return err
}

func getNodeInstanceIDs(nodes []corev1.Node) ([]string, error) {
var instanceIds []string
var errs []error
for _, node := range nodes {
providerId, err := parseKubernetesProviderID(node.Spec.ProviderID)
if err != nil {
errs = append(errs, err)
continue
}
instanceIds = append(instanceIds, providerId.InstanceID)
}
if len(errs) > 0 {
return nil, errors.Join(errs...)
}
return instanceIds, nil
}

func emitNodeMetrics(metricRegistry metrics.MetricRegistry, k8sClient *kubernetes.Clientset, ec2Client *ec2.Client) error {
nodes, err := getReadyNodes(k8sClient)
if err != nil {
Expand Down
111 changes: 111 additions & 0 deletions kubetest2/internal/deployers/eksapi/logs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package eksapi

import (
"context"
_ "embed"
"errors"
"fmt"
"time"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/ssm"
ssmtypes "github.com/aws/aws-sdk-go-v2/service/ssm/types"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
)

type logManager struct {
clients *awsClients
resourceID string
}

type deployerPhase string

const (
deployerPhaseUp = "up"
deployerPhaseDown = "down"
)

func NewLogManager(clients *awsClients, resourceID string) *logManager {
return &logManager{
clients: clients,
resourceID: resourceID,
}
}

func (m *logManager) gatherLogsFromNodes(k8sClient *kubernetes.Clientset, opts *deployerOptions, phase deployerPhase) error {
if opts.LogBucket == "" {
klog.Info("--log-bucket is empty, no logs will be gathered!")
return nil
}
switch opts.UserDataFormat {
case "bootstrap.sh", "nodeadm", "": // if no --user-data-format was passed, we must be using managed nodes, which default to AL-based AMIs
return m.gatherLogsUsingScript(k8sClient, opts, phase)
default:
klog.Warningf("unable to gather logs for userDataFormat: %s\n", opts.UserDataFormat)
return nil
}
}

//go:embed logs_ssm_doc.json
var logCollectorScriptSsmDocumentContent string

const logCollectorSsmDocumentTimeout = 5 * time.Minute

func (m *logManager) gatherLogsUsingScript(k8sClient *kubernetes.Clientset, opts *deployerOptions, phase deployerPhase) error {
klog.Info("gathering logs from nodes...")
nodes, err := k8sClient.CoreV1().Nodes().List(context.TODO(), v1.ListOptions{})
if err != nil {
return err
}
if len(nodes.Items) == 0 {
klog.Warning("no nodes to gather logs from!")
return nil
}
instanceIds, err := getNodeInstanceIDs(nodes.Items)
if err != nil {
return err
}
doc, err := m.clients.SSM().CreateDocument(context.TODO(), &ssm.CreateDocumentInput{
Content: aws.String(logCollectorScriptSsmDocumentContent),
Name: aws.String(fmt.Sprintf("%s-log-collector", m.resourceID)),
DocumentType: ssmtypes.DocumentTypeCommand,
DocumentFormat: ssmtypes.DocumentFormatJson,
})
if err != nil {
return err
}
defer func() {
m.clients.SSM().DeleteDocument(context.TODO(), &ssm.DeleteDocumentInput{
Name: doc.DocumentDescription.Name,
})
}()
command, err := m.clients.SSM().SendCommand(context.TODO(), &ssm.SendCommandInput{
DocumentName: doc.DocumentDescription.Name,
InstanceIds: instanceIds,
Parameters: map[string][]string{
"s3Destination": {fmt.Sprintf("s3://%s/node-logs/%s/%s/", opts.LogBucket, m.resourceID, phase)},
},
})
if err != nil {
return err
}
var errs []error
for _, instanceId := range instanceIds {
out, err := ssm.NewCommandExecutedWaiter(m.clients.SSM()).WaitForOutput(context.TODO(), &ssm.GetCommandInvocationInput{
CommandId: command.Command.CommandId,
InstanceId: aws.String(instanceId),
}, logCollectorSsmDocumentTimeout)
if err != nil {
errs = append(errs, err)
} else {
klog.Infof("log collection command for %s: %s", instanceId, out.Status)
}
}
if len(errs) > 0 {
return errors.Join(errs...)
}
klog.Infof("gathered logs from nodes: %v", instanceIds)
return nil
}
Loading
Loading