Skip to content

Commit

Permalink
Add node log collection
Browse files Browse the repository at this point in the history
  • Loading branch information
cartermckinnon committed Oct 15, 2024
1 parent 2125490 commit 18b1005
Show file tree
Hide file tree
Showing 8 changed files with 207 additions and 15 deletions.
9 changes: 5 additions & 4 deletions kubetest2/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@ toolchain go1.22.1

require (
github.com/aws/aws-sdk-go v1.51.2
github.com/aws/aws-sdk-go-v2 v1.26.0
github.com/aws/aws-sdk-go-v2 v1.32.2
github.com/aws/aws-sdk-go-v2/config v1.27.8
github.com/aws/aws-sdk-go-v2/service/autoscaling v1.40.4
github.com/aws/aws-sdk-go-v2/service/cloudformation v1.48.0
github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.36.3
github.com/aws/aws-sdk-go-v2/service/ec2 v1.151.1
github.com/aws/aws-sdk-go-v2/service/eks v1.41.2
github.com/aws/smithy-go v1.20.1
github.com/aws/smithy-go v1.22.0
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51
github.com/octago/sflags v0.2.0
github.com/pkg/errors v0.9.1
Expand Down Expand Up @@ -69,13 +69,14 @@ require (
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
github.com/aws/aws-sdk-go-v2/credentials v1.17.8 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.21 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 // indirect
github.com/aws/aws-sdk-go-v2/service/ecr v1.27.3 // indirect
github.com/aws/aws-sdk-go-v2/service/ecrpublic v1.23.3 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.1 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.6 // indirect
github.com/aws/aws-sdk-go-v2/service/ssm v1.55.2
github.com/aws/aws-sdk-go-v2/service/sso v1.20.3 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.23.3 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.28.5 // indirect
Expand Down
10 changes: 10 additions & 0 deletions kubetest2/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ github.com/aws/aws-sdk-go v1.51.2 h1:Ruwgz5aqIXin5Yfcgc+PCzoqW5tEGb9aDL/JWDsre7k
github.com/aws/aws-sdk-go v1.51.2/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk=
github.com/aws/aws-sdk-go-v2 v1.26.0 h1:/Ce4OCiM3EkpW7Y+xUnfAFpchU78K7/Ug01sZni9PgA=
github.com/aws/aws-sdk-go-v2 v1.26.0/go.mod h1:35hUlJVYd+M++iLI3ALmVwMOyRYMmRqUXpTtRGW+K9I=
github.com/aws/aws-sdk-go-v2 v1.32.2 h1:AkNLZEyYMLnx/Q/mSKkcMqwNFXMAvFto9bNsHqcTduI=
github.com/aws/aws-sdk-go-v2 v1.32.2/go.mod h1:2SK5n0a2karNTv5tbP1SjsX0uhttou00v/HpXKM1ZUo=
github.com/aws/aws-sdk-go-v2/config v1.27.8 h1:0r8epOsiJ7YJz65MGcb8i91ehFp4kvvFe2qkq5oYeRI=
github.com/aws/aws-sdk-go-v2/config v1.27.8/go.mod h1:XsmYKxYNuIhLsFddpNds+j9H5XKzjWDdg/SZngiwFio=
github.com/aws/aws-sdk-go-v2/credentials v1.17.8 h1:WUdNLXbyNbU07V/WFrSOBXqZTDgmmMNMgUFzpYOKJhw=
Expand All @@ -138,8 +140,12 @@ github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.4 h1:S+L2QSKhUuShih3aq9P/mkz
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.4/go.mod h1:nQ3how7DMnFMWiU1SpECohgC82fpn4cKZ875NDMmwtA=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.4 h1:0ScVK/4qZ8CIW0k8jOeFVsyS/sAiXpYxRBLolMkuLQM=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.4/go.mod h1:84KyjNZdHC6QZW08nfHI6yZgPd+qRgaWcYsyLUo3QY8=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.21 h1:UAsR3xA31QGf79WzpG/ixT9FZvQlh5HY1NRqSHBNOCk=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.21/go.mod h1:JNr43NFf5L9YaG3eKTm7HQzls9J+A9YYcGI5Quh1r2Y=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4 h1:sHmMWWX5E7guWEFQ9SVo6A3S4xpPrWnd77a6y4WM6PU=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4/go.mod h1:WjpDrhWisWOIoS9n3nk67A3Ll1vfULJ9Kq6h29HTD48=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21 h1:6jZVETqmYCadGFvrYEQfC5fAQmlo80CeL5psbno6r0s=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21/go.mod h1:1SR0GbLlnN3QUmYaflZNiH1ql+1qrSiB2vwcJ+4UM60=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 h1:hT8rVHwugYE2lEfdFE0QWVo81lF7jMrYJVDWI+f+VxU=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0/go.mod h1:8tu/lYfQfFe6IGnaOdrpVgEL2IrrDOf6/m9RQum4NkY=
github.com/aws/aws-sdk-go-v2/service/autoscaling v1.40.4 h1:f4pkN5PVSqlGxD2gZvboz6SRaeoykgknflMPBVuhcGs=
Expand All @@ -162,6 +168,8 @@ github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.6 h1:b+E7zIUHM
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.6/go.mod h1:S2fNV0rxrP78NhPbCZeQgY8H9jdDMeGtwcfZIRxzBqU=
github.com/aws/aws-sdk-go-v2/service/kms v1.27.9 h1:W9PbZAZAEcelhhjb7KuwUtf+Lbc+i7ByYJRuWLlnxyQ=
github.com/aws/aws-sdk-go-v2/service/kms v1.27.9/go.mod h1:2tFmR7fQnOdQlM2ZCEPpFnBIQD1U8wmXmduBgZbOag0=
github.com/aws/aws-sdk-go-v2/service/ssm v1.55.2 h1:z6Pq4+jtKlhK4wWJGHRGwMLGjC1HZwAO3KJr/Na0tSU=
github.com/aws/aws-sdk-go-v2/service/ssm v1.55.2/go.mod h1:DSmu/VZzpQlAubWBbAvNpt+S4k/XweglJi4XaDGyvQk=
github.com/aws/aws-sdk-go-v2/service/sso v1.20.3 h1:mnbuWHOcM70/OFUlZZ5rcdfA8PflGXXiefU/O+1S3+8=
github.com/aws/aws-sdk-go-v2/service/sso v1.20.3/go.mod h1:5HFu51Elk+4oRBZVxmHrSds5jFXmFj8C3w7DVF2gnrs=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.23.3 h1:uLq0BKatTmDzWa/Nu4WO0M1AaQDaPpwTKAeByEc6WFM=
Expand All @@ -170,6 +178,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.28.5 h1:J/PpTf/hllOjx8Xu9DMflff3Fajf
github.com/aws/aws-sdk-go-v2/service/sts v1.28.5/go.mod h1:0ih0Z83YDH/QeQ6Ori2yGE2XvWYv/Xm+cZc01LC6oK0=
github.com/aws/smithy-go v1.20.1 h1:4SZlSlMr36UEqC7XOyRVb27XMeZubNcBNN+9IgEPIQw=
github.com/aws/smithy-go v1.20.1/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E=
github.com/aws/smithy-go v1.22.0 h1:uunKnWlcoL3zO7q+gG2Pk53joueEOsnNB28QdMsmiMM=
github.com/aws/smithy-go v1.22.0/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg=
github.com/awslabs/amazon-ecr-credential-helper/ecr-login v0.0.0-20240318154307-a1a918375412 h1:tfbmGNeOidVXzO1I7zo/WsT5QX7Aa0BGTbnEAE4FG3E=
github.com/awslabs/amazon-ecr-credential-helper/ecr-login v0.0.0-20240318154307-a1a918375412/go.mod h1:kcUkjB9HwuV7PSck2b60kJtgDy+eTHWuAP0kb93FXsk=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
Expand Down
7 changes: 7 additions & 0 deletions kubetest2/internal/deployers/eksapi/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,23 @@ import (
"github.com/aws/aws-sdk-go-v2/service/cloudformation"
"github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/aws/aws-sdk-go-v2/service/eks"
"github.com/aws/aws-sdk-go-v2/service/ssm"
)

type awsClients struct {
_eks *eks.Client
_cfn *cloudformation.Client
_ec2 *ec2.Client
_asg *autoscaling.Client
_ssm *ssm.Client
}

func newAWSClients(config aws.Config, eksEndpointURL string) *awsClients {
clients := awsClients{
_cfn: cloudformation.NewFromConfig(config),
_ec2: ec2.NewFromConfig(config),
_asg: autoscaling.NewFromConfig(config),
_ssm: ssm.NewFromConfig(config),
}
if eksEndpointURL != "" {
clients._eks = eks.NewFromConfig(config, func(o *eks.Options) {
Expand All @@ -46,3 +49,7 @@ func (c *awsClients) EC2() *ec2.Client {
func (c *awsClients) ASG() *autoscaling.Client {
return c._asg
}

func (c *awsClients) SSM() *ssm.Client {
return c._ssm
}
36 changes: 25 additions & 11 deletions kubetest2/internal/deployers/eksapi/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/octago/sflags/gen/gpflag"
"github.com/spf13/pflag"
"golang.org/x/exp/slices"
"k8s.io/client-go/kubernetes"
"k8s.io/klog"
"sigs.k8s.io/kubetest2/pkg/types"
)
Expand All @@ -41,12 +42,15 @@ type deployer struct {
clusterManager *ClusterManager
addonManager *AddonManager
nodegroupManager *NodegroupManager
logManager *logManager

awsClients *awsClients

infra *Infrastructure
cluster *Cluster

k8sClient *kubernetes.Clientset

initTime time.Time
}

Expand All @@ -65,6 +69,7 @@ type deployerOptions struct {
IPFamily string `flag:"ip-family" desc:"IP family for the cluster (ipv4 or ipv6)"`
KubeconfigPath string `flag:"kubeconfig" desc:"Path to kubeconfig"`
KubernetesVersion string `flag:"kubernetes-version" desc:"cluster Kubernetes version"`
LogBucket string `flag:"log-bucket" desc:"S3 bucket for storing logs for each run. If empty, logs will not be stored."`
NodeCreationTimeout time.Duration `flag:"node-creation-timeout" desc:"Time to wait for nodes to be created/launched. This should consider instance availability."`
NodeReadyTimeout time.Duration `flag:"node-ready-timeout" desc:"Time to wait for all nodes to become ready"`
Nodes int `flag:"nodes" desc:"number of nodes to launch in cluster"`
Expand Down Expand Up @@ -117,6 +122,7 @@ func (d *deployer) Init() error {
d.clusterManager = NewClusterManager(d.awsClients, resourceID)
d.addonManager = NewAddonManager(d.awsClients)
d.nodegroupManager = NewNodegroupManager(d.awsClients, resourceID)
d.logManager = NewLogManager(d.awsClients, resourceID)
return nil
}

Expand Down Expand Up @@ -171,12 +177,12 @@ func (d *deployer) Up() error {
if err != nil {
return err
}
k8sClient, err := newKubernetesClient(kubeconfig)
d.k8sClient, err = newKubernetesClient(kubeconfig)
if err != nil {
return err
}
if d.UnmanagedNodes {
if err := createAWSAuthConfigMap(k8sClient, d.NodeNameStrategy, d.infra.nodeRole); err != nil {
if err := createAWSAuthConfigMap(d.k8sClient, d.NodeNameStrategy, d.infra.nodeRole); err != nil {
return err
}
}
Expand All @@ -187,21 +193,25 @@ func (d *deployer) Up() error {
return err
}
if d.deployerOptions.TuneVPCCNI {
if err := tuneVPCCNI(k8sClient); err != nil {
if err := tuneVPCCNI(d.k8sClient); err != nil {
return err
}
}
if err := d.nodegroupManager.createNodegroup(d.infra, d.cluster, &d.deployerOptions); err != nil {
return err
}
if err := waitForReadyNodes(k8sClient, d.Nodes, d.NodeReadyTimeout); err != nil {
if err := waitForReadyNodes(d.k8sClient, d.Nodes, d.NodeReadyTimeout); err != nil {
return err
}
if d.EmitMetrics {
if err := emitNodeMetrics(d.metrics, k8sClient, d.awsClients.EC2()); err != nil {
if err := emitNodeMetrics(d.metrics, d.k8sClient, d.awsClients.EC2()); err != nil {
return err
}
}
if err := d.logManager.gatherLogsFromNodes(d.k8sClient, &d.deployerOptions, deployerPhaseUp); err != nil {
klog.Warningf("failed to gather logs from nodes: %v", err)
// don't return err, this isn't critical
}
return nil
}

Expand All @@ -220,11 +230,11 @@ func (d *deployer) verifyUpFlags() error {
}
if d.Nodes == 0 {
d.Nodes = 3
klog.V(2).Infof("Using default number of nodes: %d", d.Nodes)
klog.Infof("Using default number of nodes: %d", d.Nodes)
}
if d.IPFamily == "" {
d.IPFamily = string(ekstypes.IpFamilyIpv4)
klog.V(2).Infof("Using default IP family: %s", d.IPFamily)
klog.Infof("Using default IP family: %s", d.IPFamily)
}
if d.UnmanagedNodes {
if d.AMI == "" {
Expand All @@ -235,15 +245,15 @@ func (d *deployer) verifyUpFlags() error {
}
if d.NodeNameStrategy == "" {
d.NodeNameStrategy = "EC2PrivateDNSName"
klog.V(2).Infof("Using default node name strategy: EC2PrivateDNSName")
klog.Infof("Using default node name strategy: EC2PrivateDNSName")
} else {
if !slices.Contains(SupportedNodeNameStrategy, d.NodeNameStrategy) {
return fmt.Errorf("--node-name-strategy must be one of the following values: ['SessionName', 'EC2PrivateDNSName']")
}
}
if d.UserDataFormat == "" {
d.UserDataFormat = "bootstrap.sh"
klog.V(2).Infof("Using default user data format: %s", d.UserDataFormat)
klog.Infof("Using default user data format: %s", d.UserDataFormat)
}
if d.EFA && len(d.InstanceTypes) != 1 {
return fmt.Errorf("--efa requires a single instance type")
Expand All @@ -253,8 +263,8 @@ func (d *deployer) verifyUpFlags() error {
return fmt.Errorf("--ami should not be provided without --unmanaged-nodes")
}
if d.AMIType == "" {
d.AMIType = "AL2023_STANDARD_X86_64"
klog.V(2).Infof("Using default AMI type: %s", d.AMIType)
d.AMIType = "AL2023_x86_64_STANDARD"
klog.Infof("Using default AMI type: %s", d.AMIType)
}
}
if d.NodeCreationTimeout == 0 {
Expand Down Expand Up @@ -283,6 +293,10 @@ func (d *deployer) IsUp() (up bool, err error) {
}

func (d *deployer) Down() error {
if err := d.logManager.gatherLogsFromNodes(d.k8sClient, &d.deployerOptions, deployerPhaseDown); err != nil {
klog.Warningf("failed to gather logs from nodes: %v", err)
// don't return err, this isn't critical
}
return deleteResources(d.infraManager, d.clusterManager, d.nodegroupManager)
}

Expand Down
17 changes: 17 additions & 0 deletions kubetest2/internal/deployers/eksapi/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,23 @@ func createAWSAuthConfigMap(client *kubernetes.Clientset, nodeNameStrategy strin
return err
}

func getNodeInstanceIDs(nodes []corev1.Node) ([]string, error) {
var instanceIds []string
var errs []error
for _, node := range nodes {
providerId, err := parseKubernetesProviderID(node.Spec.ProviderID)
if err != nil {
errs = append(errs, err)
continue
}
instanceIds = append(instanceIds, providerId.InstanceID)
}
if len(errs) > 0 {
return nil, errors.Join(errs...)
}
return instanceIds, nil
}

func emitNodeMetrics(metricRegistry metrics.MetricRegistry, k8sClient *kubernetes.Clientset, ec2Client *ec2.Client) error {
nodes, err := getReadyNodes(k8sClient)
if err != nil {
Expand Down
111 changes: 111 additions & 0 deletions kubetest2/internal/deployers/eksapi/logs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package eksapi

import (
"context"
_ "embed"
"errors"
"fmt"
"time"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/ssm"
ssmtypes "github.com/aws/aws-sdk-go-v2/service/ssm/types"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
)

type logManager struct {
clients *awsClients
resourceID string
}

type deployerPhase string

const (
deployerPhaseUp = "up"
deployerPhaseDown = "down"
)

func NewLogManager(clients *awsClients, resourceID string) *logManager {
return &logManager{
clients: clients,
resourceID: resourceID,
}
}

func (m *logManager) gatherLogsFromNodes(k8sClient *kubernetes.Clientset, opts *deployerOptions, phase deployerPhase) error {
if opts.LogBucket == "" {
klog.Info("--log-bucket is empty, no logs will be gathered!")
return nil
}
switch opts.UserDataFormat {
case "bootstrap.sh", "nodeadm", "": // if no --user-data-format was passed, we must be using managed nodes, which default to AL-based AMIs
return m.gatherLogsUsingScript(k8sClient, opts, phase)
default:
klog.Warningf("unable to gather logs for userDataFormat: %s\n", opts.UserDataFormat)
return nil
}
}

//go:embed logs_ssm_doc.json
var logCollectorScriptSsmDocumentContent string

const logCollectorSsmDocumentTimeout = 5 * time.Minute

func (m *logManager) gatherLogsUsingScript(k8sClient *kubernetes.Clientset, opts *deployerOptions, phase deployerPhase) error {
klog.Info("gathering logs from nodes...")
nodes, err := k8sClient.CoreV1().Nodes().List(context.TODO(), v1.ListOptions{})
if err != nil {
return err
}
if len(nodes.Items) == 0 {
klog.Warning("no nodes to gather logs from!")
return nil
}
instanceIds, err := getNodeInstanceIDs(nodes.Items)
if err != nil {
return err
}
doc, err := m.clients.SSM().CreateDocument(context.TODO(), &ssm.CreateDocumentInput{
Content: aws.String(logCollectorScriptSsmDocumentContent),
Name: aws.String(fmt.Sprintf("%s-log-collector", m.resourceID)),
DocumentType: ssmtypes.DocumentTypeCommand,
DocumentFormat: ssmtypes.DocumentFormatJson,
})
if err != nil {
return err
}
defer func() {
m.clients.SSM().DeleteDocument(context.TODO(), &ssm.DeleteDocumentInput{
Name: doc.DocumentDescription.Name,
})
}()
command, err := m.clients.SSM().SendCommand(context.TODO(), &ssm.SendCommandInput{
DocumentName: doc.DocumentDescription.Name,
InstanceIds: instanceIds,
Parameters: map[string][]string{
"s3Destination": {fmt.Sprintf("s3://%s/node-logs/%s/%s/", opts.LogBucket, m.resourceID, phase)},
},
})
if err != nil {
return err
}
var errs []error
for _, instanceId := range instanceIds {
out, err := ssm.NewCommandExecutedWaiter(m.clients.SSM()).WaitForOutput(context.TODO(), &ssm.GetCommandInvocationInput{
CommandId: command.Command.CommandId,
InstanceId: aws.String(instanceId),
}, logCollectorSsmDocumentTimeout)
if err != nil {
errs = append(errs, err)
} else {
klog.Infof("log collection command for %s: %s", instanceId, out.Status)
}
}
if len(errs) > 0 {
return errors.Join(errs...)
}
klog.Infof("gathered logs from nodes: %v", instanceIds)
return nil
}
Loading

0 comments on commit 18b1005

Please sign in to comment.