Skip to content

Commit

Permalink
Merge pull request #53 from Cameronsplaze/cloudwatch-metric-dashboard
Browse files Browse the repository at this point in the history
Adding Cloudwatch Dashboard to help monitor state of each leaf stack, and debug problems
  • Loading branch information
Cameronsplaze authored Oct 13, 2024
2 parents c2c7c11 + 8f1bd4b commit ba87d02
Show file tree
Hide file tree
Showing 13 changed files with 303 additions and 155 deletions.
19 changes: 9 additions & 10 deletions .github/workflows/main-pipeline-cdk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ on:
branches:
- main


#### Overall Strategy:
# When a PR is opened: make sure everything can synth (no deploy).
# When a PR is merged: deploy to your account (no synth; it already passed on the PR).
# On workflow_dispatch: synth first, then deploy only if the synth passes.

env:
EXAMPLES_PATH: ./Examples # No trailing slash plz!
Expand All @@ -40,7 +43,7 @@ jobs:
cdk-synth:
if: ( github.event_name == 'pull_request' && github.event.action != 'closed' ) ||
( github.event_name == 'workflow_dispatch')
( github.event_name == 'workflow_dispatch' )
runs-on: ubuntu-latest
needs:
- setup-matrix
Expand All @@ -67,24 +70,20 @@ jobs:

cdk-deploy:
## If the PR is merged, or if we manually trigger it (MAIN ONLY):
# !failure() && !cancelled(): Let it check the 'if' block here, to see if the last job was just skipped.
# !failure() && !cancelled(): Still evaluate the 'if' block here, EVEN if cdk-synth was skipped.
# (cdk-synth is skipped when a PR is merged. No point in synthing again, since it had to pass in order TO merge.)
if: |
!failure() && !cancelled() && (
( github.event_name == 'pull_request' && github.event.pull_request.merged && needs.cdk-synth.result == 'skipped' ) ||
( github.event_name == 'pull_request' && github.event.pull_request.merged ) ||
( github.event_name == 'workflow_dispatch' )
)
# if: !failure() && !cancelled() && (
# ( github.event_name == 'pull_request' && github.event.pull_request.merged && needs.cdk-synth.result == 'skipped') ||
# ( github.event_name == 'workflow_dispatch' )
# )
runs-on: ubuntu-latest
needs:
- setup-matrix
- cdk-synth
steps:
# Moved this check away from the job-level 'if', so it actually fails if you're
# not on main and it's easier to see.
# not on main. Failure is easier to see than a skipped job.
- name: Fail if not 'main' (else skips this step)
# Yes the pull_request line is redundant with the 'branch' trigger at the top of the
# file, but better safe than sorry. Keeps the logic uniform with workflow_dispatch too.
Expand Down
3 changes: 0 additions & 3 deletions ContainerManager/base_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,11 @@
from constructs import Construct
from aws_cdk import (
Stack,
Tags,
Duration,
RemovalPolicy,
aws_ec2 as ec2,
aws_route53 as route53,
aws_sns as sns,
aws_iam as iam,
aws_cloudwatch as cloudwatch,
)

from cdk_nag import NagSuppressions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,4 +178,4 @@ def __init__(
}
],
apply_to_children=True,
)
)
246 changes: 214 additions & 32 deletions ContainerManager/leaf_stack/NestedStacks/Dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
)
from constructs import Construct

from ContainerManager.leaf_stack.domain_stack import DomainStack
## Import the other Nested Stacks:
from . import Container, EcsAsg, Watchdog, AsgStateChangeHook

### Nested Stack info:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.NestedStack.html
class Dashboard(NestedStack):
Expand All @@ -21,49 +25,227 @@ class Dashboard(NestedStack):
def __init__(
self,
scope: Construct,
leaf_construct_id: str,
application_id: str,
container_id: str,
main_config: dict,

domain_stack: DomainStack,
container_nested_stack: Container,
ecs_asg_nested_stack: EcsAsg,
watchdog_nested_stack: Watchdog,
asg_state_change_hook_nested_stack: AsgStateChangeHook,
**kwargs
) -> None:
super().__init__(scope, "DashboardNestedStack", **kwargs)
## YES This isn't the "optimum" way if the dashboard is in the same
# stack as everything it's watching. Once the bug listed above is fixed,
# I want to move this nested stack to the base_stack, and have ALL the
# leaf_stacks push to the same dashboard. That way you can see how they
# compare to one another easily too.

#######################
### Dashboard stuff ###
#######################
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Dashboard.html
self.dashboard = cloudwatch.Dashboard(
self,
"CloudwatchDashboard",
dashboard_name=f"{leaf_construct_id}-dashboard",
period_override=cloudwatch.PeriodOverride.AUTO,
# Config options for specifically this stack:
dashboard_config = main_config["Dashboard"]

############
### Metrics used in the Widgets below:

## ASG State Change Invocation Count:
metric_asg_lambda_invocation_count = asg_state_change_hook_nested_stack.lambda_asg_state_change_hook.metric_invocations(
unit=cloudwatch.Unit.COUNT,
)

### There's currently a bug where you can't create blank widgets,
# so temporarily create a blank metric to attach to them:
# BUG: https://github.com/aws/aws-cdk/issues/31393
# DOCS: https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html
blank_metric = cloudwatch.Metric(
metric_name="blank",
namespace="blank",
period=Duration.minutes(1),
statistic="Maximum",
## ASG Traffic In/Out:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html
traffic_in_metric = cloudwatch.Metric(
label="Network In",
metric_name="NetworkIn",
namespace="AWS/EC2",
dimensions_map={"AutoScalingGroupName": ecs_asg_nested_stack.auto_scaling_group.auto_scaling_group_name},
)
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html
traffic_out_metric = cloudwatch.Metric(
label="Network Out",
metric_name="NetworkOut",
namespace="AWS/EC2",
dimensions_map={"AutoScalingGroupName": ecs_asg_nested_stack.auto_scaling_group.auto_scaling_group_name},
)
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.MathExpression.html
total_traffic_metric = cloudwatch.MathExpression(
label="Total Network Traffic",
expression="t_in + t_out",
using_metrics={
"t_in": traffic_in_metric,
"t_out": traffic_out_metric,
},
)

# "Namespace" the widgets. All the leaf stacks will need to access them, but
# I don't want to have a ton of widgets directly in "self". Plus now we can
# loop over the dict to add to the dashboard instead of adding each one manually.
self.widgets = {
### ASG Traffic PACKETS In/Out Metrics:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html
traffic_packets_in_metric = cloudwatch.Metric(
label="Network Packets In",
metric_name="NetworkPacketsIn",
namespace="AWS/EC2",
dimensions_map={"AutoScalingGroupName": ecs_asg_nested_stack.auto_scaling_group.auto_scaling_group_name},
)
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html
traffic_packets_out_metric = cloudwatch.Metric(
label="Network Packets Out",
metric_name="NetworkPacketsOut",
namespace="AWS/EC2",
dimensions_map={"AutoScalingGroupName": ecs_asg_nested_stack.auto_scaling_group.auto_scaling_group_name},
)
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.MathExpression.html
total_packets_metric = cloudwatch.MathExpression(
label="Total Packets Traffic",
expression="t_p_in + t_p_out",
using_metrics={
"t_p_in": traffic_packets_in_metric,
"t_p_out": traffic_packets_out_metric,
},
)

## EC2 Service Metrics:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.Ec2Service.html#metricwbrcpuwbrutilizationprops
cpu_utilization_metric = ecs_asg_nested_stack.ec2_service.metric_cpu_utilization()
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.Ec2Service.html#metricwbrmemorywbrutilizationprops
memory_utilization_metric = ecs_asg_nested_stack.ec2_service.metric_memory_utilization()

############
### Widgets Here. The order here is how they'll appear in the dashboard.
dashboard_widgets = [

## Route53 DNS logs for spinning up the system:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.LogQueryWidget.html
cloudwatch.LogQueryWidget(
title="DNS Traffic - Hook to Start Up System",
log_group_names=[domain_stack.route53_query_log_group.log_group_name],
region=domain_stack.region,
width=12,
query_lines=[
"fields @message",
# Spaces on either side, just like SubscriptionFilter, to not
# trigger on the "_tcp" query that pairs with the normal one:
f"filter @message like /{domain_stack.log_dns_filter}/",
],
),

## Lambda invocation count for ASG state changes:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html
"AutoScalingGroup-Traffic": cloudwatch.GraphWidget(
height=8,
cloudwatch.GraphWidget(
title="(Lambda) ASG State Change Invocations",
# Only show the last `IntervalMinutes` minutes (from the Dashboard config):
start=f"-PT{dashboard_config["IntervalMinutes"].to_minutes()}M",
height=6,
width=12,
left=[blank_metric],
right=[metric_asg_lambda_invocation_count],
legend_position=cloudwatch.LegendPosition.RIGHT,
period=Duration.minutes(1),
statistic="Sum",
),

## Show Instances, to easily see when it starts/stops.
# Should only ever be 0 or 1, but this widget displays that the best.
cloudwatch.SingleValueWidget(
title="Instance Count",
width=3,
height=4,
metrics=[watchdog_nested_stack.metric_asg_num_instances],
),
}
# Add the widgets to the dashboard:
for widget in self.widgets.values():
self.dashboard.add_widgets(widget)

## Brief summary of all the alarms, and lets you jump to them directly:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmStatusWidget.html
cloudwatch.AlarmStatusWidget(
title=f"Alarm Summary ({container_id})",
width=3,
height=4,
alarms=[
watchdog_nested_stack.alarm_container_activity,
watchdog_nested_stack.alarm_watchdog_errors,
watchdog_nested_stack.alarm_asg_instance_left_up,
],
),

## Container Activity Alarm:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html
cloudwatch.AlarmWidget(
title=f"Alarm: {watchdog_nested_stack.alarm_container_activity.alarm_name}",
width=6,
height=4,
alarm=watchdog_nested_stack.alarm_container_activity,
),

### All the ASG Traffic in/out
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html
cloudwatch.GraphWidget(
title="(ASG) All Network Traffic",
# Only show the last `IntervalMinutes` minutes (from the Dashboard config):
start=f"-PT{dashboard_config["IntervalMinutes"].to_minutes()}M",
height=6,
width=12,
left=[traffic_packets_in_metric, traffic_packets_out_metric, total_packets_metric],
right=[traffic_in_metric, traffic_out_metric, total_traffic_metric],
legend_position=cloudwatch.LegendPosition.RIGHT,
period=Duration.minutes(1),
statistic="Sum",
## Left and Right Y-Axis:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.YAxisProps.html
# Because of the MetricMath in the graph, units are unknown anyways:
left_y_axis=cloudwatch.YAxisProps(label="Traffic Packets", show_units=False),
right_y_axis=cloudwatch.YAxisProps(label="Traffic Amount", show_units=False),
),

## Instance Left Up Alarm:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html
cloudwatch.AlarmWidget(
title=f"Alarm: {watchdog_nested_stack.alarm_asg_instance_left_up.alarm_name}",
width=6,
height=4,
alarm=watchdog_nested_stack.alarm_asg_instance_left_up,
),

## WatchDog Errors Alarm:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html
cloudwatch.AlarmWidget(
title=f"Alarm: {watchdog_nested_stack.alarm_watchdog_errors.alarm_name}",
width=6,
height=4,
alarm=watchdog_nested_stack.alarm_watchdog_errors,
),

## Show the Container Logs:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.LogQueryWidget.html
cloudwatch.LogQueryWidget(
title="Container Logs",
log_group_names=[container_nested_stack.container_log_group.log_group_name],
width=12,
query_lines=[
"fields @message",
],
),

## ECS Container Utilization:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html
cloudwatch.GraphWidget(
title=f"(ECS) Container Utilization - {main_config["Ec2"]["InstanceType"]}",
# Only show the last `IntervalMinutes` minutes (from the Dashboard config):
start=f"-PT{dashboard_config["IntervalMinutes"].to_minutes()}M",
height=6,
width=12,
right=[cpu_utilization_metric, memory_utilization_metric],
# Keep both legend keys in the same spot, on the right:
legend_position=cloudwatch.LegendPosition.RIGHT,
period=Duration.minutes(1),
statistic="Maximum",
),

]

############
### Finally create the Dashboard itself:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Dashboard.html
self.dashboard = cloudwatch.Dashboard(
self,
"CloudwatchDashboard",
dashboard_name=f"{application_id}-{container_id}-Dashboard",
period_override=cloudwatch.PeriodOverride.AUTO,
default_interval=dashboard_config["IntervalMinutes"],
widgets=[dashboard_widgets],
)
Loading

0 comments on commit ba87d02

Please sign in to comment.