From 199d39f2cc7b84eac520f67240da3b5f7f7edea3 Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Thu, 26 Sep 2024 15:38:55 -0800 Subject: [PATCH 01/12] Cleaned up the pipeline, it worked --- .github/workflows/main-pipeline-cdk.yml | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/main-pipeline-cdk.yml b/.github/workflows/main-pipeline-cdk.yml index 63027cc..35c805e 100644 --- a/.github/workflows/main-pipeline-cdk.yml +++ b/.github/workflows/main-pipeline-cdk.yml @@ -19,7 +19,10 @@ on: branches: - main - +#### Overall Strategy: +# If Open a PR: Make sure everything can Synth (No deploy) +# If Merge a PR: Deploy to your account (No synth, it just passed) +# If workflow_dispatch: Synth first, then deploy if pass env: EXAMPLES_PATH: ./Examples # No trailing slash plz! @@ -40,7 +43,7 @@ jobs: cdk-synth: if: ( github.event_name == 'pull_request' && github.event.action != 'closed' ) || - ( github.event_name == 'workflow_dispatch') + ( github.event_name == 'workflow_dispatch' ) runs-on: ubuntu-latest needs: - setup-matrix @@ -67,17 +70,12 @@ jobs: cdk-deploy: ## If the PR is merged, or if we manually trigger it (MAIN ONLY): - # !failure() && !cancelled(): Let it check the 'if' block here, to see if the last job was just skipped. + # !failure() && !cancelled(): Let it check the 'if' block here, even if the last job skipped. 
if: | !failure() && !cancelled() && ( - ( github.event_name == 'pull_request' && github.event.pull_request.merged && needs.cdk-synth.result == 'skipped' ) || + ( github.event_name == 'pull_request' && github.event.pull_request.merged ) || ( github.event_name == 'workflow_dispatch' ) ) - - # if: !failure() && !cancelled() && ( - # ( github.event_name == 'pull_request' && github.event.pull_request.merged && needs.cdk-synth.result == 'skipped') || - # ( github.event_name == 'workflow_dispatch' ) - # ) runs-on: ubuntu-latest needs: - setup-matrix From b0c158e35f98c7e8364902ab9c4f11ff25374de4 Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sat, 12 Oct 2024 17:22:33 -0800 Subject: [PATCH 02/12] Make variables you're not supposed to touch, more obvious they're private --- Makefile | 30 +++++++++++++++--------------- app.py | 5 +++-- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index 6a4ccc3..7ea681d 100644 --- a/Makefile +++ b/Makefile @@ -9,14 +9,14 @@ MAKEFLAGS += --no-print-directory # Do here instead of the cdk app, so they're not duplicated in both and # avoid getting out of sync. 
Just pass them in maturity ?= prod -# The application_id and base_stack_name are only here to have in one place, -# they're not meant to be modified directly: +# The _application_id and _base_stack_name are only here to have in one place, +# THEY'RE NOT MEANT TO BE MODIFIED: ifeq ($(maturity),prod) - application_id := "ContainerManager" + _application_id := "ContainerManager" else - application_id := "ContainerManager-$(maturity)" + _application_id := "ContainerManager-$(maturity)" endif -base_stack_name := "$(application_id)-BaseStack" +_base_stack_name := "$(_application_id)-BaseStack" ## Make sure any required env-var's are set (i.e with guard-STACK_NAME) guard-%: @@ -40,23 +40,23 @@ _cdk-deploy-helper: guard-stack-regix # empty config-file is okay here cdk deploy "$(stack-regix)" \ --require-approval never \ --no-previous-parameters \ + --context _application_id="$(_application_id)" \ + --context _base_stack_name="$(_base_stack_name)" \ --context config-file="$(config-file)" \ --context maturity="$(maturity)" \ - --context application_id="$(application_id)" \ - --context base_stack_name="$(base_stack_name)" \ --context container-id="$(container-id)" echo "Finished at: `date +'%-I:%M%P (%Ss)'`" # Edit the base stack: .PHONY := cdk-deploy-base cdk-deploy-base: - $(MAKE) _cdk-deploy-helper stack-regix="$(base_stack_name)" + $(MAKE) _cdk-deploy-helper stack-regix="$(_base_stack_name)" # Edit everything BUT the base stack (within the config-file scope): .PHONY := cdk-deploy-leaf cdk-deploy-leaf: guard-config-file echo "Config File: $(config-file)" - $(MAKE) _cdk-deploy-helper stack-regix="!$(base_stack_name)" + $(MAKE) _cdk-deploy-helper stack-regix="!$(_base_stack_name)" @@ -69,23 +69,23 @@ _cdk-destroy-helper: guard-stack-regix # empty config-file is okay here echo "" cdk destroy "$(stack-regix)" \ --force \ + --context _application_id="$(_application_id)" \ + --context _base_stack_name="$(_base_stack_name)" \ --context config-file="$(config-file)" \ --context 
maturity="$(maturity)" \ - --context application_id="$(application_id)" \ - --context base_stack_name="$(base_stack_name)" \ --context container-id="$(container-id)" echo "Finished at: `date +'%-I:%M%P (%Ss)'`" # Destroy the base stack .PHONY := cdk-destroy-base cdk-destroy-base: - $(MAKE) _cdk-destroy-helper stack-regix="$(base_stack_name)" + $(MAKE) _cdk-destroy-helper stack-regix="$(_base_stack_name)" # Destroy the leaf stack inside the config-file .PHONY := cdk-destroy-leaf cdk-destroy-leaf: guard-config-file echo "Config File: $(config-file)" - $(MAKE) _cdk-destroy-helper stack-regix="!$(base_stack_name)" + $(MAKE) _cdk-destroy-helper stack-regix="!$(_base_stack_name)" ######################## @@ -110,10 +110,10 @@ cdk-synth: echo "Synthesizing Stack..." echo "" cdk synth \ + --context _application_id="$(_application_id)" \ + --context _base_stack_name="$(_base_stack_name)" \ --context config-file="$(config-file)" \ --context maturity="$(maturity)" \ - --context application_id="$(application_id)" \ - --context base_stack_name="$(base_stack_name)" \ --context container-id="$(container-id)" \ $(STACKS) diff --git a/app.py b/app.py index 46767ac..6e5fa31 100644 --- a/app.py +++ b/app.py @@ -23,7 +23,7 @@ # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.App.html app = App() -application_id = app.node.get_context("application_id") +application_id = app.node.get_context("_application_id") APPLICATION_ID_TAG_NAME = "ApplicationId" ### TODO: Finish going through all the cdk_nag checks: # Aspects.of(app).add(cdk_nag.AwsSolutionsChecks(verbose=True)) @@ -44,7 +44,7 @@ base_config = load_base_config("./base-stack-config.yaml") base_stack = ContainerManagerBaseStack( app, - app.node.get_context("base_stack_name"), + app.node.get_context("_base_stack_name"), description="The base VPC for all other ContainerManage stacks to use.", cross_region_references=True, env=main_env, @@ -88,6 +88,7 @@ env=main_env, base_stack=base_stack, domain_stack=domain_stack, + 
application_id=application_id, container_id=container_id, config=leaf_config, ) From 465c1c09d1854ea91aaaffeb76b4ea14ed857cce Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sat, 12 Oct 2024 17:23:20 -0800 Subject: [PATCH 03/12] Added a bunch of metric support. Had to do one dashboard per leaf stack, since most widgets don't support multiple metrics --- .../NestedStacks/AsgStateChangeHook.py | 23 +++++ .../leaf_stack/NestedStacks/Container.py | 15 ++++ .../leaf_stack/NestedStacks/Dashboard.py | 57 +++++++------ .../leaf_stack/NestedStacks/EcsAsg.py | 83 ++++++++++++++++--- .../leaf_stack/NestedStacks/Watchdog.py | 55 ++++++++---- .../leaf_stack/link_together_stack.py | 3 +- ContainerManager/leaf_stack/main.py | 57 ++++++++----- 7 files changed, 214 insertions(+), 79 deletions(-) diff --git a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py index e9361c9..955459a 100644 --- a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py +++ b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py @@ -12,6 +12,7 @@ aws_logs as logs, aws_ecs as ecs, aws_events as events, + aws_cloudwatch as cloudwatch, aws_events_targets as events_targets, aws_autoscaling as autoscaling, ) @@ -35,6 +36,7 @@ def __init__( ec2_service: ecs.Ec2Service, auto_scaling_group: autoscaling.AutoScalingGroup, rule_watchdog_trigger: events.Rule, + dashboard_widgets: list[tuple[int, cloudwatch.IWidget]], **kwargs, ) -> None: super().__init__(scope, "AsgStateChangeHook", **kwargs) @@ -163,6 +165,27 @@ def __init__( ], ) + ####################### + ### Dashboard Stuff ### + ####################### + ### Add asg_state_change_hook's invocations to the dashboard: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_lambda.Function.html#metricwbrinvocationsprops + metric_state_change_invocations = self.lambda_asg_state_change_hook.metric_invocations() + ## Graph it: + # 
https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html + widget_state_change_invocations = cloudwatch.GraphWidget( + title="(Lambda) ASG State Change Invocations", + # Only show up to an hour ago: + start="-PT1H", + height=6, + width=12, + right=[metric_state_change_invocations], + legend_position=cloudwatch.LegendPosition.RIGHT, + period=Duration.minutes(1), + statistic="Sum", + ) + dashboard_widgets.append((0, widget_state_change_invocations)) + ##################### ### cdk_nag stuff ### ##################### diff --git a/ContainerManager/leaf_stack/NestedStacks/Container.py b/ContainerManager/leaf_stack/NestedStacks/Container.py index e844765..2caa573 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Container.py +++ b/ContainerManager/leaf_stack/NestedStacks/Container.py @@ -9,6 +9,7 @@ RemovalPolicy, aws_ecs as ecs, aws_logs as logs, + aws_cloudwatch as cloudwatch, ) from constructs import Construct @@ -26,6 +27,7 @@ def __init__( leaf_construct_id: str, container_id: str, container_config: dict, + dashboard_widgets: list, **kwargs ) -> None: super().__init__(scope, "ContainerNestedStack", **kwargs) @@ -72,3 +74,16 @@ def __init__( log_group=self.container_log_group, ), ) + + ####################### + ### Dashboard stuff ### + ####################### + container_logs_widget = cloudwatch.LogQueryWidget( + title="Container Logs", + log_group_names=[self.container_log_group.log_group_name], + width=12, + query_lines=[ + "fields @message", + ], + ) + dashboard_widgets.append((0, container_logs_widget)) diff --git a/ContainerManager/leaf_stack/NestedStacks/Dashboard.py b/ContainerManager/leaf_stack/NestedStacks/Dashboard.py index 437762f..b316e38 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Dashboard.py +++ b/ContainerManager/leaf_stack/NestedStacks/Dashboard.py @@ -22,14 +22,15 @@ def __init__( self, scope: Construct, leaf_construct_id: str, + dashboard_widgets: list[tuple[int, cloudwatch.IWidget]], + application_id: 
str, + container_id: str, + route53_dns_log_group_name: str, + route53_dns_region: str, + route53_dns_sub_domain_name: str, **kwargs ) -> None: super().__init__(scope, "DashboardNestedStack", **kwargs) - ## YES This isn't the "optimum" way if the dashboard is in the same - # stack as everything it's watching. Once the bug listed above is fixed, - # I want to move this nested stack to the base_stack, and have ALL the - # leaf_stacks push to the same dashboard. That way you can see how they - # compare to one another easily too. ####################### ### Dashboard stuff ### @@ -38,32 +39,30 @@ def __init__( self.dashboard = cloudwatch.Dashboard( self, "CloudwatchDashboard", - dashboard_name=f"{leaf_construct_id}-dashboard", + dashboard_name=f"{application_id}-{container_id}-Dashboard", period_override=cloudwatch.PeriodOverride.AUTO, + default_interval=Duration.hours(1), ) - ### There's a bug rn where you can't create blank widgets, - # So TMP create a blank metric to attach to them: - # BUG: https://github.com/aws/aws-cdk/issues/31393 - # DOCS: https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html - blank_metric = cloudwatch.Metric( - metric_name="blank", - namespace="blank", - period=Duration.minutes(1), - statistic="Maximum", + ############# + ### Widgets for data *OUTSIDE* of this Main Managing Stack: + + ## Route53 DNS logs for spinning up the system: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.LogQueryWidget.html + dns_logs_widget = cloudwatch.LogQueryWidget( + title="DNS Traffic - Hook to Start Up System", + log_group_names=[route53_dns_log_group_name], + region=route53_dns_region, + width=12, + query_lines=[ + "fields @message", + # Spaces on either side, just like SubscriptionFilter, to not + # trigger on the "_tcp" query that pairs with the normal one: + f"filter @message like / {route53_dns_sub_domain_name} /", + ], ) + dashboard_widgets.append((0, dns_logs_widget)) - # "Namespace" the widgets. 
All the leaf stacks will need to access them, but - # I don't want to have a ton of widgets directly in "self". Plus now we can - # loop over the dict to add to the dashboard instead of adding each one manually. - self.widgets = { - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html - "AutoScalingGroup-Traffic": cloudwatch.GraphWidget( - height=8, - width=12, - left=[blank_metric], - ), - } - # Add the widgets to the dashboard: - for widget in self.widgets.values(): - self.dashboard.add_widgets(widget) + ### Add the widgets to the dashboard: + widgets = [widget for _, widget in sorted(dashboard_widgets, key=lambda x: x[0])] + self.dashboard.add_widgets(*widgets) diff --git a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py index 1e71db6..88eb0b3 100644 --- a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py +++ b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py @@ -40,7 +40,7 @@ def __init__( sg_container_traffic: ec2.SecurityGroup, efs_file_system: efs.FileSystem, host_access_point: efs.AccessPoint, - dashboard_widgets: dict, + dashboard_widgets: list[tuple[int, cloudwatch.IWidget]], **kwargs, ) -> None: super().__init__(scope, "EcsAsgNestedStack", **kwargs) @@ -160,8 +160,11 @@ def __init__( self, "AsgCapacityProvider", auto_scaling_group=self.auto_scaling_group, - # To let me delete the stack!!: + ## To let me delete the stack!!: enable_managed_termination_protection=False, + ## Since the instances don't live long, this doesn't do anything, and + # the system is trying to spin down twice when going down. 
+ enable_managed_draining=False, ) self.ecs_cluster.add_asg_capacity_provider(self.capacity_provider) ## Just to populate information in the console, doesn't change the logic: @@ -255,36 +258,92 @@ def __init__( ####################### ### Dashboard Stuff ### ####################### + ### Traffic In/Out Widget: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html traffic_in_metric = cloudwatch.Metric( + label="Network In", metric_name="NetworkIn", namespace="AWS/EC2", dimensions_map={"AutoScalingGroupName": self.auto_scaling_group.auto_scaling_group_name}, - period=Duration.minutes(1), - statistic="Sum", ) - dashboard_widgets["AutoScalingGroup-Traffic"].add_right_metric(traffic_in_metric) - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html traffic_out_metric = cloudwatch.Metric( + label="Network Out", metric_name="NetworkOut", namespace="AWS/EC2", dimensions_map={"AutoScalingGroupName": self.auto_scaling_group.auto_scaling_group_name}, - period=Duration.minutes(1), - statistic="Sum", ) - dashboard_widgets["AutoScalingGroup-Traffic"].add_right_metric(traffic_out_metric) - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.MathExpression.html total_traffic_metric = cloudwatch.MathExpression( + label="Total Network Traffic", expression="t_in + t_out", - label=f"Total Traffic: {leaf_construct_id}", using_metrics={ "t_in": traffic_in_metric, "t_out": traffic_out_metric, }, ) - dashboard_widgets["AutoScalingGroup-Traffic"].add_right_metric(total_traffic_metric) + + ### Traffic PACKETS In/Out Widget: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html + traffic_packets_in_metric = cloudwatch.Metric( + label="Network Packets In", + metric_name="NetworkPacketsIn", + namespace="AWS/EC2", + dimensions_map={"AutoScalingGroupName": self.auto_scaling_group.auto_scaling_group_name}, + ) + # 
https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html + traffic_packets_out_metric = cloudwatch.Metric( + label="Network Packets Out", + metric_name="NetworkPacketsOut", + namespace="AWS/EC2", + dimensions_map={"AutoScalingGroupName": self.auto_scaling_group.auto_scaling_group_name}, + ) + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.MathExpression.html + total_packets_metric = cloudwatch.MathExpression( + label="Total Packets Traffic", + expression="t_p_in + t_p_out", + using_metrics={ + "t_p_in": traffic_packets_in_metric, + "t_p_out": traffic_packets_out_metric, + }, + ) + + ### Put them all in a GraphWidget: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html + traffic_widget = cloudwatch.GraphWidget( + title="(ASG) All Network Traffic", + # Only show up to an hour ago: + start="-PT1H", + height=6, + width=12, + left=[traffic_packets_in_metric, traffic_packets_out_metric, total_packets_metric], + right=[traffic_in_metric, traffic_out_metric, total_traffic_metric], + legend_position=cloudwatch.LegendPosition.RIGHT, + period=Duration.minutes(1), + statistic="Sum", + ) + dashboard_widgets.append((0, traffic_widget)) + + ### ECS Info Widget: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.Ec2Service.html#metricwbrcpuwbrutilizationprops + cpu_utilization_metric = self.ec2_service.metric_cpu_utilization() + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.Ec2Service.html#metricwbrmemorywbrutilizationprops + memory_utilization_metric = self.ec2_service.metric_memory_utilization() + ### Put them all in a GraphWidget: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html + ecs_container_graph = cloudwatch.GraphWidget( + title="(ECS) Container Utilization", + # Only show up to an hour ago: + start="-PT1H", + height=6, + width=12, + right=[cpu_utilization_metric, memory_utilization_metric], + # But have 
both keys in the same spot, on the right: + legend_position=cloudwatch.LegendPosition.RIGHT, + period=Duration.minutes(1), + statistic="Maximum", + ) + dashboard_widgets.append((0, ecs_container_graph)) ##################### ### cdk_nag stuff ### diff --git a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py index 4583f24..a09df2f 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py +++ b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py @@ -35,8 +35,7 @@ def __init__( task_definition: ecs.Ec2TaskDefinition, auto_scaling_group: autoscaling.AutoScalingGroup, base_stack_sns_topic: sns.Topic, - dashboard: cloudwatch.Dashboard, - dashboard_widgets: dict, + dashboard_widgets: list, **kwargs, ) -> None: super().__init__(scope, "WatchdogNestedStack", **kwargs) @@ -70,8 +69,8 @@ def __init__( self.alarm_asg_instance_left_up = self.metric_asg_num_instances.create_alarm( self, "AlarmInstanceLeftUp", - alarm_name=f"{leaf_construct_id}-Alarm-Instance-left-up", - alarm_description="To warn if the instance is up too long", + alarm_name=f"Instance Left Up ({leaf_construct_id})", + alarm_description=f"({leaf_construct_id}) To warn if the instance is up too long", ### This way if the period changes, this will stay the same duration: # Total Duration = Number of Periods * Period length... 
so # Number of Periods = Total Duration / Period length @@ -92,7 +91,6 @@ def __init__( cloudwatch_actions.AutoScalingAction(self.scale_down_asg_action) ) - ############################# ## Count Connections Logic ## ############################# @@ -157,8 +155,8 @@ def __init__( self.alarm_container_activity = self.metric_total_activity.create_alarm( self, "AlarmContainerActivity", - alarm_name=f"{leaf_construct_id}-Alarm-ContainerActivity", - alarm_description="Trigger if 0 people are connected for too long", + alarm_name=f"Container Activity ({leaf_construct_id})", + alarm_description=f"({leaf_construct_id}) Trigger if 0 people are connected for too long", evaluation_periods=evaluation_periods, threshold=0, comparison_operator=cloudwatch.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD, @@ -261,8 +259,8 @@ def __init__( self.alarm_watchdog_errors = self.metric_watchdog_errors.create_alarm( self, "AlarmWatchdogErrors", - alarm_name=f"{leaf_construct_id}-Alarm-Watchdog-Errors", - alarm_description="Trigger if the Lambda Watchdog fails too many times", + alarm_name=f"Watchdog Errors ({leaf_construct_id})", + alarm_description=f"({leaf_construct_id}) Trigger if the Lambda Watchdog fails too many times", # Must be in alarm this long consecutively to trigger. 3 strikes you're out: # (Duration doesn't matter here, no need to divide by metric period. We ALWAYS want 3) evaluation_periods=3, @@ -297,17 +295,40 @@ def __init__( ####################### ### Dashboard stuff ### ####################### - + graph_alarms = [ + ## Have to keep 'priority' here so each alarm can + # have a different priority on the dashboard. + # (They have different numbers, so they stack on one-another. + # Otherwise they'd be on different rows). 
+ (0, self.alarm_container_activity), + (0, self.alarm_watchdog_errors), + (0, self.alarm_asg_instance_left_up), + ] ## You can't append alarms to *this* widget after it's created, so I'm just having one per stack: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmStatusWidget.html alarm_status_widget = cloudwatch.AlarmStatusWidget( - title=f"Alarm Status: {leaf_construct_id}", + title=f"Alarm Summary ({container_id})", width=6, height=4, - alarms=[ - self.alarm_container_activity, - self.alarm_watchdog_errors, - self.alarm_asg_instance_left_up, - ] + alarms=[alarm[1] for alarm in graph_alarms], + ) + dashboard_widgets.append((0, alarm_status_widget)) + + for priority, alarm in graph_alarms: + alarm_widget = cloudwatch.AlarmWidget( + title=f"Alarm: {alarm.alarm_name}", + width=6, + height=4, + alarm=alarm, + ) + dashboard_widgets.append((priority, alarm_widget)) + + ## Add the number of Instances, to easily see when it starts/stops. + # Should only ever be 0 or 1, but this widget displays that the best. 
+ num_instances_widget = cloudwatch.SingleValueWidget( + title="Number of Instances", + width=3, + height=4, + metrics=[self.metric_asg_num_instances], ) - dashboard.add_widgets(alarm_status_widget) + dashboard_widgets.append((0, num_instances_widget)) diff --git a/ContainerManager/leaf_stack/link_together_stack.py b/ContainerManager/leaf_stack/link_together_stack.py index d0d9dec..28ab248 100644 --- a/ContainerManager/leaf_stack/link_together_stack.py +++ b/ContainerManager/leaf_stack/link_together_stack.py @@ -102,7 +102,8 @@ def __init__( "SubscriptionFilter", log_group=domain_stack.route53_query_log_group, destination=logs_destinations.LambdaDestination(self.lambda_start_system), - filter_pattern=logs.FilterPattern.any_term(domain_stack.sub_domain_name), + # Spaces on either side, so it doesn't match the "_tcp" query that pairs with it: + filter_pattern=logs.FilterPattern.any_term(f" {domain_stack.sub_domain_name} "), filter_name="TriggerLambdaOnConnect", ) diff --git a/ContainerManager/leaf_stack/main.py b/ContainerManager/leaf_stack/main.py index 1972d70..99b2d0d 100644 --- a/ContainerManager/leaf_stack/main.py +++ b/ContainerManager/leaf_stack/main.py @@ -41,25 +41,25 @@ def get_logical_id(self, element): return super().get_logical_id(element) def __init__( - self, - scope: Construct, - construct_id: str, - base_stack: ContainerManagerBaseStack, - domain_stack: DomainStack, - container_id: str, - config: dict, - **kwargs - ) -> None: + self, + scope: Construct, + construct_id: str, + base_stack: ContainerManagerBaseStack, + domain_stack: DomainStack, + application_id: str, + container_id: str, + config: dict, + **kwargs + ) -> None: super().__init__(scope, construct_id, **kwargs) - ####################### - ### Dashboard Stuff ### - ####################### - self.dashboard_nested_stack = NestedStacks.Dashboard( - self, - description=f"Dashboard Logic for {construct_id}", - leaf_construct_id=construct_id, - ) + ###################### + ### Dashboard Prep ### 
+ ###################### + ### LOWEST priority number is FIRST on the dashboard.) + ### Append to this in the form of: + # (priority: int, widget: cloudwatch.IWidget) + dashboard_widgets = [] ############################### @@ -101,6 +101,7 @@ def __init__( leaf_construct_id=construct_id, container_id=container_id, container_config=config["Container"], + dashboard_widgets=dashboard_widgets, ) ### All the info for EFS Stuff @@ -130,7 +131,7 @@ def __init__( sg_container_traffic=self.sg_nested_stack.sg_container_traffic, efs_file_system=self.efs_nested_stack.efs_file_system, host_access_point=self.efs_nested_stack.host_access_point, - dashboard_widgets=self.dashboard_nested_stack.widgets, + dashboard_widgets=dashboard_widgets, ) ### All the info for the Watchdog Stuff @@ -143,8 +144,7 @@ def __init__( task_definition=self.container_nested_stack.task_definition, auto_scaling_group=self.ecs_asg_nested_stack.auto_scaling_group, base_stack_sns_topic=base_stack.sns_notify_topic, - dashboard=self.dashboard_nested_stack.dashboard, - dashboard_widgets=self.dashboard_nested_stack.widgets, + dashboard_widgets=dashboard_widgets, ) ### All the info for the Asg StateChange Hook Stuff @@ -157,6 +157,23 @@ def __init__( ec2_service=self.ecs_asg_nested_stack.ec2_service, auto_scaling_group=self.ecs_asg_nested_stack.auto_scaling_group, rule_watchdog_trigger=self.watchdog_nested_stack.rule_watchdog_trigger, + dashboard_widgets=dashboard_widgets, + ) + + ####################### + ### Dashboard Stuff ### + ####################### + self.dashboard_nested_stack = NestedStacks.Dashboard( + self, + description=f"Dashboard Logic for {construct_id}", + leaf_construct_id=construct_id, + dashboard_widgets=dashboard_widgets, + application_id=application_id, + container_id=container_id, + # For creating other widgets info: + route53_dns_log_group_name=domain_stack.route53_query_log_group.log_group_name, + route53_dns_region=domain_stack.region, + 
route53_dns_sub_domain_name=domain_stack.sub_domain_name, ) ##################### From 2f39443ce7fbcad1e5fed41087ca682823f1803e Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sat, 12 Oct 2024 17:34:10 -0800 Subject: [PATCH 04/12] Added more-correct type hint to dashboard_widgets --- ContainerManager/leaf_stack/NestedStacks/Container.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ContainerManager/leaf_stack/NestedStacks/Container.py b/ContainerManager/leaf_stack/NestedStacks/Container.py index 2caa573..c616064 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Container.py +++ b/ContainerManager/leaf_stack/NestedStacks/Container.py @@ -27,7 +27,7 @@ def __init__( leaf_construct_id: str, container_id: str, container_config: dict, - dashboard_widgets: list, + dashboard_widgets: list[tuple[int, cloudwatch.IWidget]], **kwargs ) -> None: super().__init__(scope, "ContainerNestedStack", **kwargs) From bf037225148baa8dfffe99d8ccdfc3a0c83fcabb Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sat, 12 Oct 2024 18:15:50 -0800 Subject: [PATCH 05/12] Ordered the dashboard widgets to be organized --- .../leaf_stack/NestedStacks/AsgStateChangeHook.py | 2 +- .../leaf_stack/NestedStacks/Container.py | 2 +- .../leaf_stack/NestedStacks/Dashboard.py | 2 +- ContainerManager/leaf_stack/NestedStacks/EcsAsg.py | 4 ++-- ContainerManager/leaf_stack/NestedStacks/Watchdog.py | 12 ++++++------ 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py index 955459a..d3e10d8 100644 --- a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py +++ b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py @@ -184,7 +184,7 @@ def __init__( period=Duration.minutes(1), statistic="Sum", ) - dashboard_widgets.append((0, widget_state_change_invocations)) + dashboard_widgets.append((2, widget_state_change_invocations)) 
##################### ### cdk_nag stuff ### diff --git a/ContainerManager/leaf_stack/NestedStacks/Container.py b/ContainerManager/leaf_stack/NestedStacks/Container.py index c616064..21fc701 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Container.py +++ b/ContainerManager/leaf_stack/NestedStacks/Container.py @@ -86,4 +86,4 @@ def __init__( "fields @message", ], ) - dashboard_widgets.append((0, container_logs_widget)) + dashboard_widgets.append((9, container_logs_widget)) diff --git a/ContainerManager/leaf_stack/NestedStacks/Dashboard.py b/ContainerManager/leaf_stack/NestedStacks/Dashboard.py index b316e38..81d85e3 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Dashboard.py +++ b/ContainerManager/leaf_stack/NestedStacks/Dashboard.py @@ -61,7 +61,7 @@ def __init__( f"filter @message like / {route53_dns_sub_domain_name} /", ], ) - dashboard_widgets.append((0, dns_logs_widget)) + dashboard_widgets.append((1, dns_logs_widget)) ### Add the widgets to the dashboard: widgets = [widget for _, widget in sorted(dashboard_widgets, key=lambda x: x[0])] diff --git a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py index 88eb0b3..8456bb5 100644 --- a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py +++ b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py @@ -322,7 +322,7 @@ def __init__( period=Duration.minutes(1), statistic="Sum", ) - dashboard_widgets.append((0, traffic_widget)) + dashboard_widgets.append((6, traffic_widget)) ### ECS Info Widget: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.Ec2Service.html#metricwbrcpuwbrutilizationprops @@ -343,7 +343,7 @@ def __init__( period=Duration.minutes(1), statistic="Maximum", ) - dashboard_widgets.append((0, ecs_container_graph)) + dashboard_widgets.append((10, ecs_container_graph)) ##################### ### cdk_nag stuff ### diff --git a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py 
index a09df2f..b24a6a9 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py +++ b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py @@ -300,19 +300,19 @@ def __init__( # have a different priority on the dashboard. # (They have different numbers, so they stack on one-another. # Otherwise they'd be on different rows). - (0, self.alarm_container_activity), - (0, self.alarm_watchdog_errors), - (0, self.alarm_asg_instance_left_up), + (5, self.alarm_container_activity), + (8, self.alarm_watchdog_errors), + (7, self.alarm_asg_instance_left_up), ] ## You can't append alarms to *this* widget after it's created, so I'm just having one per stack: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmStatusWidget.html alarm_status_widget = cloudwatch.AlarmStatusWidget( title=f"Alarm Summary ({container_id})", - width=6, + width=3, height=4, alarms=[alarm[1] for alarm in graph_alarms], ) - dashboard_widgets.append((0, alarm_status_widget)) + dashboard_widgets.append((4, alarm_status_widget)) for priority, alarm in graph_alarms: alarm_widget = cloudwatch.AlarmWidget( @@ -331,4 +331,4 @@ def __init__( height=4, metrics=[self.metric_asg_num_instances], ) - dashboard_widgets.append((0, num_instances_widget)) + dashboard_widgets.append((3, num_instances_widget)) From 59fae3108015127631a0539578d404b54436cc6f Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sat, 12 Oct 2024 18:58:56 -0800 Subject: [PATCH 06/12] Adding units to a graph --- .../leaf_stack/NestedStacks/AsgStateChangeHook.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py index d3e10d8..8afe7b3 100644 --- a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py +++ b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py @@ -170,7 +170,9 @@ def __init__( ####################### ### Add asg_state_change_hook's 
invocations to the dashboard: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_lambda.Function.html#metricwbrinvocationsprops - metric_state_change_invocations = self.lambda_asg_state_change_hook.metric_invocations() + metric_state_change_invocations = self.lambda_asg_state_change_hook.metric_invocations( + unit=cloudwatch.Unit.COUNT, + ) ## Graph it: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html widget_state_change_invocations = cloudwatch.GraphWidget( From 925a24d937e5448009b1b3c8ea1c4d579eacfd0d Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sat, 12 Oct 2024 19:24:46 -0800 Subject: [PATCH 07/12] Other minor tweaks for the dashboard --- ContainerManager/leaf_stack/NestedStacks/EcsAsg.py | 5 +++++ ContainerManager/leaf_stack/NestedStacks/Watchdog.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py index 8456bb5..fd90706 100644 --- a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py +++ b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py @@ -321,6 +321,11 @@ def __init__( legend_position=cloudwatch.LegendPosition.RIGHT, period=Duration.minutes(1), statistic="Sum", + ## Left and Right Y-Axis: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.YAxisProps.html + # Because of the MetricMath in the graph, units are unknown anyways: + left_y_axis=cloudwatch.YAxisProps(label="Traffic Packets", show_units=False), + right_y_axis=cloudwatch.YAxisProps(label="Traffic Amount", show_units=False), ) dashboard_widgets.append((6, traffic_widget)) diff --git a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py index b24a6a9..340d026 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py +++ b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py @@ -326,7 +326,7 @@ def __init__( ## Add the number of 
Instances, to easily see when it starts/stops. # Should only ever be 0 or 1, but this widget displays that the best. num_instances_widget = cloudwatch.SingleValueWidget( - title="Number of Instances", + title="Instance Count", width=3, height=4, metrics=[self.metric_asg_num_instances], From 71e6c05a1e6d818000cadd94f3c051402e7d283b Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sat, 12 Oct 2024 19:35:56 -0800 Subject: [PATCH 08/12] Other minor tweaks for the dashboard --- ContainerManager/leaf_stack/NestedStacks/EcsAsg.py | 2 +- ContainerManager/leaf_stack/NestedStacks/Watchdog.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py index fd90706..d3f67fb 100644 --- a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py +++ b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py @@ -163,7 +163,7 @@ def __init__( ## To let me delete the stack!!: enable_managed_termination_protection=False, ## Since the instances don't live long, this doesn't do anything, and - # the system is trying to spin down twice when going down. + # the lambda to spin down the system will trigger TWICE when going down. enable_managed_draining=False, ) self.ecs_cluster.add_asg_capacity_provider(self.capacity_provider) diff --git a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py index 340d026..c738b27 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py +++ b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py @@ -70,7 +70,7 @@ def __init__( self, "AlarmInstanceLeftUp", alarm_name=f"Instance Left Up ({leaf_construct_id})", - alarm_description=f"({leaf_construct_id}) To warn if the instance is up too long", + alarm_description="To warn if the instance is up too long", ### This way if the period changes, this will stay the same duration: # Total Duration = Number of Periods * Period length... 
so # Number of Periods = Total Duration / Period length @@ -156,7 +156,7 @@ def __init__( self, "AlarmContainerActivity", alarm_name=f"Container Activity ({leaf_construct_id})", - alarm_description=f"({leaf_construct_id}) Trigger if 0 people are connected for too long", + alarm_description="Trigger if 0 people are connected for too long", evaluation_periods=evaluation_periods, threshold=0, comparison_operator=cloudwatch.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD, @@ -260,7 +260,7 @@ def __init__( self, "AlarmWatchdogErrors", alarm_name=f"Watchdog Errors ({leaf_construct_id})", - alarm_description=f"({leaf_construct_id}) Trigger if the Lambda Watchdog fails too many times", + alarm_description="Trigger if the Lambda Watchdog fails too many times", # Must be in alarm this long consecutively to trigger. 3 strikes you're out: # (Duration doesn't matter here, no need to divide by metric period. We ALWAYS want 3) evaluation_periods=3, From 62f6d2d1485bd91eb593f7e1e9b2db0204dde7b7 Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sun, 13 Oct 2024 09:58:13 -0800 Subject: [PATCH 09/12] Moved all the widgets to the Dashboard NestedStack. It keeps the other nested stacks smaller, and it's easier to use shared variables across the metrics/widgets/dashboard now. 
--- .../NestedStacks/AsgStateChangeHook.py | 24 -- .../leaf_stack/NestedStacks/Container.py | 14 - .../leaf_stack/NestedStacks/Dashboard.py | 239 +++++++++++++++--- .../leaf_stack/NestedStacks/EcsAsg.py | 102 +------- .../leaf_stack/NestedStacks/Watchdog.py | 46 +--- ContainerManager/leaf_stack/domain_stack.py | 3 + .../leaf_stack/link_together_stack.py | 2 +- ContainerManager/leaf_stack/main.py | 26 +- ContainerManager/utils/config_loader.py | 17 ++ Examples/Minecraft-example.yaml | 3 + 10 files changed, 246 insertions(+), 230 deletions(-) diff --git a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py index 8afe7b3..0fb167a 100644 --- a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py +++ b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py @@ -36,7 +36,6 @@ def __init__( ec2_service: ecs.Ec2Service, auto_scaling_group: autoscaling.AutoScalingGroup, rule_watchdog_trigger: events.Rule, - dashboard_widgets: list[tuple[int, cloudwatch.IWidget]], **kwargs, ) -> None: super().__init__(scope, "AsgStateChangeHook", **kwargs) @@ -165,29 +164,6 @@ def __init__( ], ) - ####################### - ### Dashboard Stuff ### - ####################### - ### Add asg_state_change_hook's invocations to the dashboard: - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_lambda.Function.html#metricwbrinvocationsprops - metric_state_change_invocations = self.lambda_asg_state_change_hook.metric_invocations( - unit=cloudwatch.Unit.COUNT, - ) - ## Graph it: - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html - widget_state_change_invocations = cloudwatch.GraphWidget( - title="(Lambda) ASG State Change Invocations", - # Only show up to an hour ago: - start="-PT1H", - height=6, - width=12, - right=[metric_state_change_invocations], - legend_position=cloudwatch.LegendPosition.RIGHT, - period=Duration.minutes(1), - statistic="Sum", - ) - 
dashboard_widgets.append((2, widget_state_change_invocations)) - ##################### ### cdk_nag stuff ### ##################### diff --git a/ContainerManager/leaf_stack/NestedStacks/Container.py b/ContainerManager/leaf_stack/NestedStacks/Container.py index 21fc701..374f3bd 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Container.py +++ b/ContainerManager/leaf_stack/NestedStacks/Container.py @@ -27,7 +27,6 @@ def __init__( leaf_construct_id: str, container_id: str, container_config: dict, - dashboard_widgets: list[tuple[int, cloudwatch.IWidget]], **kwargs ) -> None: super().__init__(scope, "ContainerNestedStack", **kwargs) @@ -74,16 +73,3 @@ def __init__( log_group=self.container_log_group, ), ) - - ####################### - ### Dashboard stuff ### - ####################### - container_logs_widget = cloudwatch.LogQueryWidget( - title="Container Logs", - log_group_names=[self.container_log_group.log_group_name], - width=12, - query_lines=[ - "fields @message", - ], - ) - dashboard_widgets.append((9, container_logs_widget)) diff --git a/ContainerManager/leaf_stack/NestedStacks/Dashboard.py b/ContainerManager/leaf_stack/NestedStacks/Dashboard.py index 81d85e3..d60a3eb 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Dashboard.py +++ b/ContainerManager/leaf_stack/NestedStacks/Dashboard.py @@ -10,6 +10,10 @@ ) from constructs import Construct +from ContainerManager.leaf_stack.domain_stack import DomainStack +## Import the other Nested Stacks: +from . 
import Container, EcsAsg, Watchdog, AsgStateChangeHook + ### Nested Stack info: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.NestedStack.html class Dashboard(NestedStack): @@ -21,13 +25,15 @@ class Dashboard(NestedStack): def __init__( self, scope: Construct, - leaf_construct_id: str, - dashboard_widgets: list[tuple[int, cloudwatch.IWidget]], application_id: str, container_id: str, - route53_dns_log_group_name: str, - route53_dns_region: str, - route53_dns_sub_domain_name: str, + volume_config: dict, + + domain_stack: DomainStack, + container_nested_stack: Container, + ecs_asg_nested_stack: EcsAsg, + watchdog_nested_stack: Watchdog, + asg_state_change_hook_nested_stack: AsgStateChangeHook, **kwargs ) -> None: super().__init__(scope, "DashboardNestedStack", **kwargs) @@ -35,34 +41,209 @@ def __init__( ####################### ### Dashboard stuff ### ####################### + + ############ + ### Metrics used in the Widgets below: + + ## ASG State Change Invocation Count: + metric_asg_lambda_invocation_count = asg_state_change_hook_nested_stack.lambda_asg_state_change_hook.metric_invocations( + unit=cloudwatch.Unit.COUNT, + ) + + ## ASG Traffic In/Out: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html + traffic_in_metric = cloudwatch.Metric( + label="Network In", + metric_name="NetworkIn", + namespace="AWS/EC2", + dimensions_map={"AutoScalingGroupName": ecs_asg_nested_stack.auto_scaling_group.auto_scaling_group_name}, + ) + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html + traffic_out_metric = cloudwatch.Metric( + label="Network Out", + metric_name="NetworkOut", + namespace="AWS/EC2", + dimensions_map={"AutoScalingGroupName": ecs_asg_nested_stack.auto_scaling_group.auto_scaling_group_name}, + ) + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.MathExpression.html + total_traffic_metric = cloudwatch.MathExpression( + label="Total Network Traffic", + 
expression="t_in + t_out", + using_metrics={ + "t_in": traffic_in_metric, + "t_out": traffic_out_metric, + }, + ) + + ### ASG Traffic PACKETS In/Out Widget: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html + traffic_packets_in_metric = cloudwatch.Metric( + label="Network Packets In", + metric_name="NetworkPacketsIn", + namespace="AWS/EC2", + dimensions_map={"AutoScalingGroupName": ecs_asg_nested_stack.auto_scaling_group.auto_scaling_group_name}, + ) + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html + traffic_packets_out_metric = cloudwatch.Metric( + label="Network Packets Out", + metric_name="NetworkPacketsOut", + namespace="AWS/EC2", + dimensions_map={"AutoScalingGroupName": ecs_asg_nested_stack.auto_scaling_group.auto_scaling_group_name}, + ) + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.MathExpression.html + total_packets_metric = cloudwatch.MathExpression( + label="Total Packets Traffic", + expression="t_p_in + t_p_out", + using_metrics={ + "t_p_in": traffic_packets_in_metric, + "t_p_out": traffic_packets_out_metric, + }, + ) + + ## EC2 Service Metrics: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.Ec2Service.html#metricwbrcpuwbrutilizationprops + cpu_utilization_metric = ecs_asg_nested_stack.ec2_service.metric_cpu_utilization() + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.Ec2Service.html#metricwbrmemorywbrutilizationprops + memory_utilization_metric = ecs_asg_nested_stack.ec2_service.metric_memory_utilization() + + ############ + ### Widgets Here. The order here is how they'll appear in the dashboard. 
+ dashboard_widgets = [ + + ## Route53 DNS logs for spinning up the system: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.LogQueryWidget.html + cloudwatch.LogQueryWidget( + title="DNS Traffic - Hook to Start Up System", + log_group_names=[domain_stack.route53_query_log_group.log_group_name], + region=domain_stack.region, + width=12, + query_lines=[ + "fields @message", + # Spaces on either side, just like SubscriptionFilter, to not + # trigger on the "_tcp" query that pairs with the normal one: + f"filter @message like /{domain_stack.log_dns_filter}/", + ], + ), + + ## Lambda Invocation count for after AWS State Changes + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html + cloudwatch.GraphWidget( + title="(Lambda) ASG State Change Invocations", + # Only show up to an hour ago: + start=f"-PT{volume_config["IntervalMinutes"].to_minutes()}M", + height=6, + width=12, + right=[metric_asg_lambda_invocation_count], + legend_position=cloudwatch.LegendPosition.RIGHT, + period=Duration.minutes(1), + statistic="Sum", + ), + + ## Show Instances, to easily see when it starts/stops. + # Should only ever be 0 or 1, but this widget displays that the best. 
+ cloudwatch.SingleValueWidget( + title="Instance Count", + width=3, + height=4, + metrics=[watchdog_nested_stack.metric_asg_num_instances], + ), + + ## Brief summary of all the alarms, and lets you jump to them directly: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmStatusWidget.html + cloudwatch.AlarmStatusWidget( + title=f"Alarm Summary ({container_id})", + width=3, + height=4, + alarms=[ + watchdog_nested_stack.alarm_container_activity, + watchdog_nested_stack.alarm_watchdog_errors, + watchdog_nested_stack.alarm_asg_instance_left_up, + ], + ), + + ## Container Activity Alarm: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html + cloudwatch.AlarmWidget( + title=f"Alarm: {watchdog_nested_stack.alarm_container_activity.alarm_name}", + width=6, + height=4, + alarm=watchdog_nested_stack.alarm_container_activity, + ), + + ### All the ASG Traffic in/out + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html + cloudwatch.GraphWidget( + title="(ASG) All Network Traffic", + # Only show up to an hour ago: + start=f"-PT{volume_config["IntervalMinutes"].to_minutes()}M", + height=6, + width=12, + left=[traffic_packets_in_metric, traffic_packets_out_metric, total_packets_metric], + right=[traffic_in_metric, traffic_out_metric, total_traffic_metric], + legend_position=cloudwatch.LegendPosition.RIGHT, + period=Duration.minutes(1), + statistic="Sum", + ## Left and Right Y-Axis: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.YAxisProps.html + # Because of the MetricMath in the graph, units are unknown anyways: + left_y_axis=cloudwatch.YAxisProps(label="Traffic Packets", show_units=False), + right_y_axis=cloudwatch.YAxisProps(label="Traffic Amount", show_units=False), + ), + + ## Instance Left Up Alarm: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html + cloudwatch.AlarmWidget( + title=f"Alarm: 
{watchdog_nested_stack.alarm_asg_instance_left_up.alarm_name}", + width=6, + height=4, + alarm=watchdog_nested_stack.alarm_asg_instance_left_up, + ), + + ## WatchDog Errors Alarm: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html + cloudwatch.AlarmWidget( + title=f"Alarm: {watchdog_nested_stack.alarm_watchdog_errors.alarm_name}", + width=6, + height=4, + alarm=watchdog_nested_stack.alarm_watchdog_errors, + ), + + ## Show the Container Logs: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.LogQueryWidget.html + cloudwatch.LogQueryWidget( + title="Container Logs", + log_group_names=[container_nested_stack.container_log_group.log_group_name], + width=12, + query_lines=[ + "fields @message", + ], + ), + + ## ECS Container Utilization: + # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html + cloudwatch.GraphWidget( + title=f"(ECS) Container Utilization - {ecs_asg_nested_stack.instance_type}", + # Only show up to an hour ago: + start=f"-PT{volume_config["IntervalMinutes"].to_minutes()}M", + height=6, + width=12, + right=[cpu_utilization_metric, memory_utilization_metric], + # But have both keys in the same spot, on the right: + legend_position=cloudwatch.LegendPosition.RIGHT, + period=Duration.minutes(1), + statistic="Maximum", + ), + + ] + + ############ + ### Finally create the Dashboard itself: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Dashboard.html self.dashboard = cloudwatch.Dashboard( self, "CloudwatchDashboard", dashboard_name=f"{application_id}-{container_id}-Dashboard", period_override=cloudwatch.PeriodOverride.AUTO, - default_interval=Duration.hours(1), + default_interval=volume_config["IntervalMinutes"], + widgets=[dashboard_widgets], ) - - ############# - ### Widgets for data *OUTSIDE* of this Main Managing Stack: - - ## Route53 DNS logs for spinning up the system: - # 
https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.LogQueryWidget.html - dns_logs_widget = cloudwatch.LogQueryWidget( - title="DNS Traffic - Hook to Start Up System", - log_group_names=[route53_dns_log_group_name], - region=route53_dns_region, - width=12, - query_lines=[ - "fields @message", - # Spaces on either side, just like SubscriptionFilter, to not - # trigger on the "_tcp" query that pairs with the normal one: - f"filter @message like / {route53_dns_sub_domain_name} /", - ], - ) - dashboard_widgets.append((1, dns_logs_widget)) - - ### Add the widgets to the dashboard: - widgets = [widget for _, widget in sorted(dashboard_widgets, key=lambda x: x[0])] - self.dashboard.add_widgets(*widgets) diff --git a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py index d3f67fb..9138fd9 100644 --- a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py +++ b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py @@ -40,11 +40,12 @@ def __init__( sg_container_traffic: ec2.SecurityGroup, efs_file_system: efs.FileSystem, host_access_point: efs.AccessPoint, - dashboard_widgets: list[tuple[int, cloudwatch.IWidget]], **kwargs, ) -> None: super().__init__(scope, "EcsAsgNestedStack", **kwargs) + ## The instance type to use for the container: + self.instance_type = ec2_config["InstanceType"] ## Cluster for the the container # This has to stay in this stack. 
A cluster represents a single "instance type" @@ -57,7 +58,6 @@ def __init__( vpc=vpc, ) - ## Permissions for inside the instance/host of the container: self.ec2_role = iam.Role( self, @@ -111,7 +111,7 @@ def __init__( self.launch_template = ec2.LaunchTemplate( self, "LaunchTemplate", - instance_type=ec2.InstanceType(ec2_config["InstanceType"]), + instance_type=ec2.InstanceType(self.instance_type), ## Needs to be an "EcsOptimized" image to register to the cluster # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.EcsOptimizedImage.html machine_image=ecs.EcsOptimizedImage.amazon_linux2023(), @@ -125,7 +125,6 @@ def __init__( require_imdsv2=True, ) - ## A Fleet represents a managed set of EC2 instances: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_autoscaling.AutoScalingGroup.html self.auto_scaling_group = autoscaling.AutoScalingGroup( @@ -255,101 +254,6 @@ def __init__( ], ) - ####################### - ### Dashboard Stuff ### - ####################### - ### Traffic In/Out Widget: - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html - traffic_in_metric = cloudwatch.Metric( - label="Network In", - metric_name="NetworkIn", - namespace="AWS/EC2", - dimensions_map={"AutoScalingGroupName": self.auto_scaling_group.auto_scaling_group_name}, - ) - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html - traffic_out_metric = cloudwatch.Metric( - label="Network Out", - metric_name="NetworkOut", - namespace="AWS/EC2", - dimensions_map={"AutoScalingGroupName": self.auto_scaling_group.auto_scaling_group_name}, - ) - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.MathExpression.html - total_traffic_metric = cloudwatch.MathExpression( - label="Total Network Traffic", - expression="t_in + t_out", - using_metrics={ - "t_in": traffic_in_metric, - "t_out": traffic_out_metric, - }, - ) - - ### Traffic PACKETS In/Out Widget: - # 
https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html - traffic_packets_in_metric = cloudwatch.Metric( - label="Network Packets In", - metric_name="NetworkPacketsIn", - namespace="AWS/EC2", - dimensions_map={"AutoScalingGroupName": self.auto_scaling_group.auto_scaling_group_name}, - ) - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html - traffic_packets_out_metric = cloudwatch.Metric( - label="Network Packets Out", - metric_name="NetworkPacketsOut", - namespace="AWS/EC2", - dimensions_map={"AutoScalingGroupName": self.auto_scaling_group.auto_scaling_group_name}, - ) - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.MathExpression.html - total_packets_metric = cloudwatch.MathExpression( - label="Total Packets Traffic", - expression="t_p_in + t_p_out", - using_metrics={ - "t_p_in": traffic_packets_in_metric, - "t_p_out": traffic_packets_out_metric, - }, - ) - - ### Put them all in a GraphWidget: - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html - traffic_widget = cloudwatch.GraphWidget( - title="(ASG) All Network Traffic", - # Only show up to an hour ago: - start="-PT1H", - height=6, - width=12, - left=[traffic_packets_in_metric, traffic_packets_out_metric, total_packets_metric], - right=[traffic_in_metric, traffic_out_metric, total_traffic_metric], - legend_position=cloudwatch.LegendPosition.RIGHT, - period=Duration.minutes(1), - statistic="Sum", - ## Left and Right Y-Axis: - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.YAxisProps.html - # Because of the MetricMath in the graph, units are unknown anyways: - left_y_axis=cloudwatch.YAxisProps(label="Traffic Packets", show_units=False), - right_y_axis=cloudwatch.YAxisProps(label="Traffic Amount", show_units=False), - ) - dashboard_widgets.append((6, traffic_widget)) - - ### ECS Info Widget: - # 
https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.Ec2Service.html#metricwbrcpuwbrutilizationprops - cpu_utilization_metric = self.ec2_service.metric_cpu_utilization() - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.Ec2Service.html#metricwbrmemorywbrutilizationprops - memory_utilization_metric = self.ec2_service.metric_memory_utilization() - ### Put them all in a GraphWidget: - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html - ecs_container_graph = cloudwatch.GraphWidget( - title="(ECS) Container Utilization", - # Only show up to an hour ago: - start="-PT1H", - height=6, - width=12, - right=[cpu_utilization_metric, memory_utilization_metric], - # But have both keys in the same spot, on the right: - legend_position=cloudwatch.LegendPosition.RIGHT, - period=Duration.minutes(1), - statistic="Maximum", - ) - dashboard_widgets.append((10, ecs_container_graph)) - ##################### ### cdk_nag stuff ### ##################### diff --git a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py index c738b27..c17f23d 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Watchdog.py +++ b/ContainerManager/leaf_stack/NestedStacks/Watchdog.py @@ -35,7 +35,6 @@ def __init__( task_definition: ecs.Ec2TaskDefinition, auto_scaling_group: autoscaling.AutoScalingGroup, base_stack_sns_topic: sns.Topic, - dashboard_widgets: list, **kwargs, ) -> None: super().__init__(scope, "WatchdogNestedStack", **kwargs) @@ -65,7 +64,7 @@ def __init__( ) ## And the alarm to flag if the instance is up too long: # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Alarm.html - duration_before_alarm = Duration.hours(watchdog_config["InstanceLeftUp"]["DurationHours"]).to_minutes() + duration_before_alarm = watchdog_config["InstanceLeftUp"]["DurationHours"].to_minutes() self.alarm_asg_instance_left_up = self.metric_asg_num_instances.create_alarm( self, 
"AlarmInstanceLeftUp", @@ -151,7 +150,7 @@ def __init__( # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.Metric.html#createwbralarmscope-id-props # Total Duration = Number of Periods * Period length... so # Number of Periods = Total Duration / Period length - evaluation_periods = int(watchdog_config["MinutesWithoutConnections"] / self.metric_total_activity.period.to_minutes()) + evaluation_periods = int(watchdog_config["MinutesWithoutConnections"].to_minutes() / self.metric_total_activity.period.to_minutes()) self.alarm_container_activity = self.metric_total_activity.create_alarm( self, "AlarmContainerActivity", @@ -291,44 +290,3 @@ def __init__( # Start disabled, self.lambda_watchdog_container_activity will enable it when instance starts up enabled=False, ) - - ####################### - ### Dashboard stuff ### - ####################### - graph_alarms = [ - ## Have to keep 'priority' here so each alarm can - # have a different priority on the dashboard. - # (They have different numbers, so they stack on one-another. - # Otherwise they'd be on different rows). - (5, self.alarm_container_activity), - (8, self.alarm_watchdog_errors), - (7, self.alarm_asg_instance_left_up), - ] - ## You can't append alarms to *this* widget after it's created, so I'm just having one per stack: - # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmStatusWidget.html - alarm_status_widget = cloudwatch.AlarmStatusWidget( - title=f"Alarm Summary ({container_id})", - width=3, - height=4, - alarms=[alarm[1] for alarm in graph_alarms], - ) - dashboard_widgets.append((4, alarm_status_widget)) - - for priority, alarm in graph_alarms: - alarm_widget = cloudwatch.AlarmWidget( - title=f"Alarm: {alarm.alarm_name}", - width=6, - height=4, - alarm=alarm, - ) - dashboard_widgets.append((priority, alarm_widget)) - - ## Add the number of Instances, to easily see when it starts/stops. - # Should only ever be 0 or 1, but this widget displays that the best. 
- num_instances_widget = cloudwatch.SingleValueWidget( - title="Instance Count", - width=3, - height=4, - metrics=[self.metric_asg_num_instances], - ) - dashboard_widgets.append((3, num_instances_widget)) diff --git a/ContainerManager/leaf_stack/domain_stack.py b/ContainerManager/leaf_stack/domain_stack.py index f4411a7..f053f98 100644 --- a/ContainerManager/leaf_stack/domain_stack.py +++ b/ContainerManager/leaf_stack/domain_stack.py @@ -37,6 +37,9 @@ def __init__( self.dns_ttl = 1 self.record_type = route53.RecordType.A self.sub_domain_name = f"{container_id}.{base_stack.root_hosted_zone.zone_name}".lower() + # Spaces on the ends to not match sub-domains like "_tcp.*" that shows up in logs. + # The record_type is because BOTH A and AAAA appear, even if my ISP only supports one. + self.log_dns_filter = f" {self.sub_domain_name} {self.record_type.value} " ## Log group for the Route53 DNS logs: self.route53_query_log_group = logs.LogGroup( diff --git a/ContainerManager/leaf_stack/link_together_stack.py b/ContainerManager/leaf_stack/link_together_stack.py index 28ab248..5323f0f 100644 --- a/ContainerManager/leaf_stack/link_together_stack.py +++ b/ContainerManager/leaf_stack/link_together_stack.py @@ -103,7 +103,7 @@ def __init__( log_group=domain_stack.route53_query_log_group, destination=logs_destinations.LambdaDestination(self.lambda_start_system), # Spaces on either side, so it doesn't match the "_tcp" query that pairs with it: - filter_pattern=logs.FilterPattern.any_term(f" {domain_stack.sub_domain_name} "), + filter_pattern=logs.FilterPattern.any_term(domain_stack.log_dns_filter), filter_name="TriggerLambdaOnConnect", ) diff --git a/ContainerManager/leaf_stack/main.py b/ContainerManager/leaf_stack/main.py index 99b2d0d..5c76c37 100644 --- a/ContainerManager/leaf_stack/main.py +++ b/ContainerManager/leaf_stack/main.py @@ -53,15 +53,6 @@ def __init__( ) -> None: super().__init__(scope, construct_id, **kwargs) - ###################### - ### Dashboard Prep ### - 
###################### - ### LOWEST priority number is FIRST on the dashboard.) - ### Append to this in the form of: - # (priority: int, widget: cloudwatch.IWidget) - dashboard_widgets = [] - - ############################### ## Container-specific Notify ## ############################### @@ -101,7 +92,6 @@ def __init__( leaf_construct_id=construct_id, container_id=container_id, container_config=config["Container"], - dashboard_widgets=dashboard_widgets, ) ### All the info for EFS Stuff @@ -131,7 +121,6 @@ def __init__( sg_container_traffic=self.sg_nested_stack.sg_container_traffic, efs_file_system=self.efs_nested_stack.efs_file_system, host_access_point=self.efs_nested_stack.host_access_point, - dashboard_widgets=dashboard_widgets, ) ### All the info for the Watchdog Stuff @@ -144,7 +133,6 @@ def __init__( task_definition=self.container_nested_stack.task_definition, auto_scaling_group=self.ecs_asg_nested_stack.auto_scaling_group, base_stack_sns_topic=base_stack.sns_notify_topic, - dashboard_widgets=dashboard_widgets, ) ### All the info for the Asg StateChange Hook Stuff @@ -157,7 +145,6 @@ def __init__( ec2_service=self.ecs_asg_nested_stack.ec2_service, auto_scaling_group=self.ecs_asg_nested_stack.auto_scaling_group, rule_watchdog_trigger=self.watchdog_nested_stack.rule_watchdog_trigger, - dashboard_widgets=dashboard_widgets, ) ####################### @@ -166,14 +153,15 @@ def __init__( self.dashboard_nested_stack = NestedStacks.Dashboard( self, description=f"Dashboard Logic for {construct_id}", - leaf_construct_id=construct_id, - dashboard_widgets=dashboard_widgets, application_id=application_id, container_id=container_id, - # For creating other widgets info: - route53_dns_log_group_name=domain_stack.route53_query_log_group.log_group_name, - route53_dns_region=domain_stack.region, - route53_dns_sub_domain_name=domain_stack.sub_domain_name, + volume_config=config["Dashboard"], + + domain_stack=domain_stack, + container_nested_stack=self.container_nested_stack, + 
ecs_asg_nested_stack=self.ecs_asg_nested_stack, + watchdog_nested_stack=self.watchdog_nested_stack, + asg_state_change_hook_nested_stack=self.asg_state_change_hook_nested_stack, ) ##################### diff --git a/ContainerManager/utils/config_loader.py b/ContainerManager/utils/config_loader.py index d0db8c0..9640caf 100644 --- a/ContainerManager/utils/config_loader.py +++ b/ContainerManager/utils/config_loader.py @@ -13,6 +13,7 @@ # to a base class. from aws_cdk import ( + Duration, aws_sns as sns, aws_ecs as ecs, ) @@ -201,6 +202,8 @@ def _parse_watchdog_minutes_without_connections(config: dict) -> None: config["Watchdog"]["MinutesWithoutConnections"] = 5 assert isinstance(config["Watchdog"]["MinutesWithoutConnections"], int) assert config["Watchdog"]["MinutesWithoutConnections"] >= 2, "Watchdog.MinutesWithoutConnections must be at least 2." + # Cast it into a duration object: + config["Watchdog"]["MinutesWithoutConnections"] = Duration.minutes(config["Watchdog"]["MinutesWithoutConnections"]) def _parse_watchdog_threshold(config: dict) -> None: if "Threshold" not in config["Watchdog"]: @@ -223,6 +226,9 @@ def _parse_watchdog_instance_left_up(config: dict) -> None: if "DurationHours" not in config["Watchdog"]["InstanceLeftUp"]: config["Watchdog"]["InstanceLeftUp"]["DurationHours"] = 8 assert isinstance(config["Watchdog"]["InstanceLeftUp"]["DurationHours"], int) + assert config["Watchdog"]["InstanceLeftUp"]["DurationHours"] > 0, "Watchdog.InstanceLeftUp.DurationHours must be greater than 0." 
+ # Cast it into a duration object: + config["Watchdog"]["InstanceLeftUp"]["DurationHours"] = Duration.hours(config["Watchdog"]["InstanceLeftUp"]["DurationHours"]) # ShouldStop if "ShouldStop" not in config["Watchdog"]["InstanceLeftUp"]: config["Watchdog"]["InstanceLeftUp"]["ShouldStop"] = False @@ -245,6 +251,16 @@ def _parse_watchdog_instance_left_up(config: dict) -> None: ### InstanceLeftUp Block _parse_watchdog_instance_left_up(config) +def _parse_dashboard(config: dict) -> None: + if "Dashboard" not in config: + config["Dashboard"] = {} + assert isinstance(config["Dashboard"], dict) + if "IntervalMinutes" not in config["Dashboard"]: + config["Dashboard"]["IntervalMinutes"] = 30 + assert isinstance(config["Dashboard"]["IntervalMinutes"], int) + assert config["Dashboard"]["IntervalMinutes"] > 0, "Dashboard.IntervalMinutes must be greater than 0." + # Cast it into a duration object: + config["Dashboard"]["IntervalMinutes"] = Duration.minutes(config["Dashboard"]["IntervalMinutes"]) def load_leaf_config(path: str, maturity: str) -> dict: " Parser/Loader for all leaf stacks " @@ -254,4 +270,5 @@ def load_leaf_config(path: str, maturity: str) -> dict: _parse_ec2(config) _parse_watchdog(config) _parse_sns(config) + _parse_dashboard(config) return config diff --git a/Examples/Minecraft-example.yaml b/Examples/Minecraft-example.yaml index a43d163..82adea3 100644 --- a/Examples/Minecraft-example.yaml +++ b/Examples/Minecraft-example.yaml @@ -28,3 +28,6 @@ Volume: AlertSubscription: - Email: Does@Not.Exist + +Dashboard: + IntervalMinutes: 30 From 7a806c59399fff4ae632ef2e141a0d3ef19fd1b5 Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sun, 13 Oct 2024 10:14:43 -0800 Subject: [PATCH 10/12] pylint recommended changes --- ContainerManager/base_stack.py | 3 --- ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py | 3 +-- ContainerManager/leaf_stack/NestedStacks/Container.py | 1 - ContainerManager/leaf_stack/NestedStacks/EcsAsg.py | 2 -- 
ContainerManager/leaf_stack/main.py | 2 -- 5 files changed, 1 insertion(+), 10 deletions(-) diff --git a/ContainerManager/base_stack.py b/ContainerManager/base_stack.py index dcb3d00..6bce595 100644 --- a/ContainerManager/base_stack.py +++ b/ContainerManager/base_stack.py @@ -6,14 +6,11 @@ from constructs import Construct from aws_cdk import ( Stack, - Tags, - Duration, RemovalPolicy, aws_ec2 as ec2, aws_route53 as route53, aws_sns as sns, aws_iam as iam, - aws_cloudwatch as cloudwatch, ) from cdk_nag import NagSuppressions diff --git a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py index 0fb167a..8e01e1f 100644 --- a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py +++ b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py @@ -12,7 +12,6 @@ aws_logs as logs, aws_ecs as ecs, aws_events as events, - aws_cloudwatch as cloudwatch, aws_events_targets as events_targets, aws_autoscaling as autoscaling, ) @@ -179,4 +178,4 @@ def __init__( } ], apply_to_children=True, - ) \ No newline at end of file + ) diff --git a/ContainerManager/leaf_stack/NestedStacks/Container.py b/ContainerManager/leaf_stack/NestedStacks/Container.py index 374f3bd..e844765 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Container.py +++ b/ContainerManager/leaf_stack/NestedStacks/Container.py @@ -9,7 +9,6 @@ RemovalPolicy, aws_ecs as ecs, aws_logs as logs, - aws_cloudwatch as cloudwatch, ) from constructs import Construct diff --git a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py index 9138fd9..16b865f 100644 --- a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py +++ b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py @@ -5,7 +5,6 @@ from aws_cdk import ( NestedStack, - Duration, aws_ec2 as ec2, aws_ecs as ecs, aws_iam as iam, @@ -14,7 +13,6 @@ aws_events as events, aws_events_targets as events_targets, aws_autoscaling as 
autoscaling, - aws_cloudwatch as cloudwatch, ) from constructs import Construct diff --git a/ContainerManager/leaf_stack/main.py b/ContainerManager/leaf_stack/main.py index 5c76c37..441adf9 100644 --- a/ContainerManager/leaf_stack/main.py +++ b/ContainerManager/leaf_stack/main.py @@ -7,9 +7,7 @@ from aws_cdk import ( Stack, - Duration, aws_sns as sns, - aws_kms as kms, ) from constructs import Construct from cdk_nag import NagSuppressions From 7071148e92e200b83a7083991eaddf9075baa0dd Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sun, 13 Oct 2024 10:47:43 -0800 Subject: [PATCH 11/12] Added more-clear comments --- .github/workflows/main-pipeline-cdk.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main-pipeline-cdk.yml b/.github/workflows/main-pipeline-cdk.yml index 35c805e..aa354bd 100644 --- a/.github/workflows/main-pipeline-cdk.yml +++ b/.github/workflows/main-pipeline-cdk.yml @@ -70,7 +70,8 @@ jobs: cdk-deploy: ## If the PR is merged, or if we manually trigger it (MAIN ONLY): - # !failure() && !cancelled(): Let it check the 'if' block here, even if the last job skipped. + # !failure() && !cancelled(): Let it check the 'if' block here, EVEN if cdk-synth skipped. + # (it'll skip when PR is merged. No point in synthing again since it has to pass TO merge) if: | !failure() && !cancelled() && ( ( github.event_name == 'pull_request' && github.event.pull_request.merged ) || @@ -82,7 +83,7 @@ jobs: - cdk-synth steps: # Moved this check away from the job-level 'if', so it actually fails if you're - # not on main and it's easier to see. + # not on main. Failure is easier to see than a skipped job. - name: Fail if not 'main' (else skips this step) # Yes the pull_request line is redundant with the 'branch' trigger at the top of the # file, but better safe than sorry. Keeps the logic uniform with workflow_dispatch too. 
From 8f1bd4b4ca23e7291dfa1418011732cd47252c74 Mon Sep 17 00:00:00 2001 From: Cameron Showalter Date: Sun, 13 Oct 2024 11:02:57 -0800 Subject: [PATCH 12/12] Made the dashboard grab the instance type from the config directly, so it doesn't depend on a variable inside the Container stack --- .../leaf_stack/NestedStacks/Dashboard.py | 14 ++++++++------ ContainerManager/leaf_stack/NestedStacks/EcsAsg.py | 5 +---- ContainerManager/leaf_stack/main.py | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/ContainerManager/leaf_stack/NestedStacks/Dashboard.py b/ContainerManager/leaf_stack/NestedStacks/Dashboard.py index d60a3eb..dff2d82 100644 --- a/ContainerManager/leaf_stack/NestedStacks/Dashboard.py +++ b/ContainerManager/leaf_stack/NestedStacks/Dashboard.py @@ -27,7 +27,7 @@ def __init__( scope: Construct, application_id: str, container_id: str, - volume_config: dict, + main_config: dict, domain_stack: DomainStack, container_nested_stack: Container, @@ -41,6 +41,8 @@ def __init__( ####################### ### Dashboard stuff ### ####################### + # Config options for specifically this stack: + dashboard_config = main_config["Dashboard"] ############ ### Metrics used in the Widgets below: @@ -130,7 +132,7 @@ def __init__( cloudwatch.GraphWidget( title="(Lambda) ASG State Change Invocations", # Only show up to an hour ago: - start=f"-PT{volume_config["IntervalMinutes"].to_minutes()}M", + start=f"-PT{dashboard_config["IntervalMinutes"].to_minutes()}M", height=6, width=12, right=[metric_asg_lambda_invocation_count], @@ -175,7 +177,7 @@ def __init__( cloudwatch.GraphWidget( title="(ASG) All Network Traffic", # Only show up to an hour ago: - start=f"-PT{volume_config["IntervalMinutes"].to_minutes()}M", + start=f"-PT{dashboard_config["IntervalMinutes"].to_minutes()}M", height=6, width=12, left=[traffic_packets_in_metric, traffic_packets_out_metric, total_packets_metric], @@ -222,9 +224,9 @@ def __init__( ## ECS Container Utilization: #
https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html cloudwatch.GraphWidget( - title=f"(ECS) Container Utilization - {ecs_asg_nested_stack.instance_type}", + title=f"(ECS) Container Utilization - {main_config["Ec2"]["InstanceType"]}", # Only show up to an hour ago: - start=f"-PT{volume_config["IntervalMinutes"].to_minutes()}M", + start=f"-PT{dashboard_config["IntervalMinutes"].to_minutes()}M", height=6, width=12, right=[cpu_utilization_metric, memory_utilization_metric], @@ -244,6 +246,6 @@ def __init__( "CloudwatchDashboard", dashboard_name=f"{application_id}-{container_id}-Dashboard", period_override=cloudwatch.PeriodOverride.AUTO, - default_interval=volume_config["IntervalMinutes"], + default_interval=dashboard_config["IntervalMinutes"], widgets=[dashboard_widgets], ) diff --git a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py index 16b865f..013a9f8 100644 --- a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py +++ b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py @@ -42,9 +42,6 @@ def __init__( ) -> None: super().__init__(scope, "EcsAsgNestedStack", **kwargs) - ## The instance type to use for the container: - self.instance_type = ec2_config["InstanceType"] - ## Cluster for the the container # This has to stay in this stack. A cluster represents a single "instance type" # sort of. This is the only way to tie the ASG to the ECS Service, one-to-one. 
@@ -109,7 +106,7 @@ def __init__( self.launch_template = ec2.LaunchTemplate( self, "LaunchTemplate", - instance_type=ec2.InstanceType(self.instance_type), + instance_type=ec2.InstanceType(ec2_config["InstanceType"]), ## Needs to be an "EcsOptimized" image to register to the cluster # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_ecs.EcsOptimizedImage.html machine_image=ecs.EcsOptimizedImage.amazon_linux2023(), diff --git a/ContainerManager/leaf_stack/main.py b/ContainerManager/leaf_stack/main.py index 441adf9..4cca3ff 100644 --- a/ContainerManager/leaf_stack/main.py +++ b/ContainerManager/leaf_stack/main.py @@ -153,7 +153,7 @@ def __init__( description=f"Dashboard Logic for {construct_id}", application_id=application_id, container_id=container_id, - volume_config=config["Dashboard"], + main_config=config, domain_stack=domain_stack, container_nested_stack=self.container_nested_stack,