Skip to content

Commit

Permalink
SuperAgent Health Check PoC.
Browse files Browse the repository at this point in the history
  • Loading branch information
jaffinito committed Nov 22, 2024
1 parent 4634356 commit b91f8f5
Show file tree
Hide file tree
Showing 28 changed files with 639 additions and 96 deletions.
98 changes: 97 additions & 1 deletion src/Agent/NewRelic/Agent/Core/AgentHealth/AgentHealthReporter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
using System.Linq;
using System.Net;
using System.Threading;
using System.IO;
using System.Security.Policy;
using System.Runtime.InteropServices;
using Grpc.Core;

namespace NewRelic.Agent.Core.AgentHealth
{
Expand All @@ -38,10 +42,27 @@ public class AgentHealthReporter : ConfigurationBasedService, IAgentHealthReport
private InterlockedCounter _traceContextCreateSuccessCounter;
private InterlockedCounter _traceContextAcceptSuccessCounter;

private readonly HealthCheck _healthCheck;

public AgentHealthReporter(IMetricBuilder metricBuilder, IScheduler scheduler)
{
_healthCheck = new()
{
IsHealthy = true,
Status = "Agent starting",
LastError = string.Empty
};

_metricBuilder = metricBuilder;
_scheduler = scheduler;

// Want this to start immediately and write out the first health check - only if fl
if (string.IsNullOrWhiteSpace(_configuration.FleetId))
{
Log.Info(">>>>>>>>>AgentHealthReporter: Starting health check");
_scheduler.ExecuteEvery(PublishSuperAgentHealthCheck, TimeSpan.FromSeconds(_configuration.HealthFrequency), TimeSpan.Zero);
}

_scheduler.ExecuteEvery(LogPeriodicReport, _timeBetweenExecutions);
var agentHealthEvents = Enum.GetValues(typeof(AgentHealthEvent)) as AgentHealthEvent[];
foreach (var agentHealthEvent in agentHealthEvents)
Expand Down Expand Up @@ -667,6 +688,81 @@ public void ReportLogForwardingConfiguredValues()

#endregion

#region Super Agent

/// <summary>
/// Records the "super agent health enabled" supportability count metric, but only
/// when a fleet id is present in the current configuration.
/// </summary>
private void ReportIfSuperAgentHealthEnabled()
{
    var fleetIdConfigured = !string.IsNullOrWhiteSpace(_configuration.FleetId);
    if (!fleetIdConfigured)
    {
        // No fleet id configured: nothing to report.
        return;
    }

    ReportSupportabilityCountMetric(MetricNames.SupportabilitySuperAgentHealthEnabled);
}

/// <summary>
/// Updates the in-memory super agent health state with the supplied health code.
/// </summary>
/// <param name="healthStatus">One of the (IsHealthy, Code, Status) tuples, e.g. from <see cref="HealthCodes"/>.</param>
/// <param name="statusParams">Optional values forwarded to the health check for status message placeholders.</param>
public void SetSuperAgentStatus((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams)
{
    Log.Info($">>>>>>>>>SetSuperAgentStatus code:'{healthStatus.Code}' status:'{healthStatus.Status}' params:'{(statusParams == null ? string.Empty : string.Join(",", statusParams))}'");

    // The super agent integration is treated as disabled when no fleet id is configured.
    var superAgentEnabled = !string.IsNullOrWhiteSpace(_configuration.FleetId);
    if (!superAgentEnabled)
    {
        return;
    }

    // Every status other than the "healthy shutdown" marker is applied unconditionally.
    if (!healthStatus.Equals(HealthCodes.AgentShutdownHealthy))
    {
        _healthCheck.TrySetHealth(healthStatus, statusParams);
        return;
    }

    // A healthy-shutdown status may only replace a healthy state, so that an
    // existing error is not overwritten at shutdown.
    if (_healthCheck.IsHealthy)
    {
        _healthCheck.TrySetHealth(healthStatus);
    }
}

/// <summary>
/// Writes the current health check as a YAML file into the directory named by the
/// configured health delivery location.
/// </summary>
/// <remarks>
/// Does nothing when the fleet id or delivery location is not configured, or when the
/// delivery location is not an absolute <c>file:</c> URI. This method is invoked from the
/// scheduler and from the shutdown path, so failures are logged rather than thrown.
/// </remarks>
public void PublishSuperAgentHealthCheck()
{
    Log.Info(">>>>>>>>>PublishSuperAgentHealthCheck");
    if (string.IsNullOrWhiteSpace(_configuration.FleetId)
        || string.IsNullOrWhiteSpace(_configuration.HealthDeliveryLocation))
    {
        Log.Info(">>>>>>>>>PublishSuperAgentHealthCheck.NOPE");
        return;
    }

    Log.Info(">>>>>>>>>PublishSuperAgentHealthCheck.YEP");

    // TryCreate instead of the Uri constructor: a malformed or relative location must
    // not throw out of a scheduled callback or the shutdown "finally" block.
    if (!Uri.TryCreate(_configuration.HealthDeliveryLocation, UriKind.Absolute, out var fileUri)
        || fileUri.Scheme != Uri.UriSchemeFile)
    {
        Log.Debug("The provided superagent.health.delivery_location is not a file URL, skipping super agent health check: " + _configuration.HealthDeliveryLocation);
        return;
    }

    // Ensure the path is cleaned up for Windows by removing a possible leading slash
    var cleanedPath = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? fileUri.LocalPath.TrimStart('/') : fileUri.LocalPath;
    if (!Directory.Exists(cleanedPath))
    {
        try
        {
            Directory.CreateDirectory(cleanedPath);
        }
        catch (Exception ex)
        {
            Log.Error(ex, "Error creating directory for super agent health check: " + cleanedPath);
            return;
        }
    }

    try
    {
        // Synchronous write: there is no benefit to blocking on an async write here.
        // Like the StreamWriter(path) constructor, this replaces any existing file.
        var healthFilePath = Path.Combine(cleanedPath, _healthCheck.FileName);
        File.WriteAllText(healthFilePath, _healthCheck.ToYaml());
    }
    catch (Exception ex)
    {
        // Best effort: a failed write (permissions, disk full, invalid path) is logged
        // and the next scheduled run will try again.
        Log.Error(ex, "Error writing super agent health check file to: " + cleanedPath);
    }
}

#endregion

public void ReportSupportabilityPayloadsDroppeDueToMaxPayloadSizeLimit(string endpoint)
{
TrySend(_metricBuilder.TryBuildSupportabilityPayloadsDroppedDueToMaxPayloadLimit(endpoint));
Expand All @@ -685,6 +781,7 @@ private void CollectOneTimeMetrics()
ReportIfLoggingDisabled();
ReportIfInstrumentationIsDisabled();
ReportIfGCSamplerV2IsEnabled();
ReportIfSuperAgentHealthEnabled();
}

public void CollectMetrics()
Expand Down Expand Up @@ -849,6 +946,5 @@ private void ReportIfGCSamplerV2IsEnabled()
}

}

}
}
58 changes: 58 additions & 0 deletions src/Agent/NewRelic/Agent/Core/AgentHealth/HealthCheck.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright 2020 New Relic, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

using System;
using NewRelic.Agent.Core.Utilities;

namespace NewRelic.Agent.Core.AgentHealth
{
/// <summary>
/// Holds the agent's current health state and renders it as the YAML payload
/// written to the health check file.
/// </summary>
public class HealthCheck
{
    private const int NanoSecondsPerMillisecond = 1000000;

    /// <summary>True when the most recently applied health status was a healthy one.</summary>
    public bool IsHealthy { get; internal set; }

    /// <summary>Human-readable status message, with any placeholders already filled in.</summary>
    public string Status { get; internal set; }

    /// <summary>Code of the most recently applied health status.</summary>
    public string LastError { get; internal set; }

    /// <summary>UTC time at which this instance was created.</summary>
    public DateTime StartTime { get; } = DateTime.UtcNow;

    /// <summary>UTC time of the most recent <see cref="ToYaml"/> call.</summary>
    public DateTime StatusTime { get; internal set; }

    /// <summary>File name for this instance's health file; unique per instance.</summary>
    public string FileName { get; } = "health-" + System.Guid.NewGuid().ToString("N") + ".yml";

    /// <summary>
    /// Applies the supplied health status to this instance.
    /// </summary>
    /// <param name="healthStatus">The health flag, code, and status message template to apply.</param>
    /// <param name="statusParams">
    /// Optional values substituted into the <c>{0}</c>, <c>{1}</c>, ... placeholders of
    /// <paramref name="healthStatus"/>'s status template. When omitted, the template is used as-is.
    /// </param>
    public void TrySetHealth((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams)
    {
        // NOTE(review): not synchronized. SOURCE shows callers setting status while a
        // scheduled job publishes it, so confirm whether a lock is needed here.
        IsHealthy = healthStatus.IsHealthy;

        // Format the incoming template, not the previously stored status text.
        Status = FormatStatus(healthStatus.Status, statusParams);

        LastError = healthStatus.Code;
    }

    /// <summary>
    /// Renders the current state as YAML and stamps <see cref="StatusTime"/> with the current UTC time.
    /// </summary>
    /// <returns>The YAML document, without a trailing newline.</returns>
    public string ToYaml()
    {
        StatusTime = DateTime.UtcNow;
        // Timestamps are derived from milliseconds, so the nanosecond values carry millisecond precision.
        return $"healthy: {IsHealthy}\nstatus: {Status}\nlast_error: {LastError}\nstatus_time_unix_nano: {StatusTime.ToUnixTimeMilliseconds() * NanoSecondsPerMillisecond}\nstart_time_unix_nano: {StartTime.ToUnixTimeMilliseconds() * NanoSecondsPerMillisecond}";
    }

    /// <summary>
    /// Fills the placeholders of a status template, falling back to the raw template
    /// when there is nothing to substitute or the arguments do not match the placeholders.
    /// </summary>
    private static string FormatStatus(string template, string[] statusParams)
    {
        if (string.IsNullOrEmpty(template) || statusParams == null || statusParams.Length == 0)
        {
            return template;
        }

        try
        {
            return string.Format(template, (object[])statusParams);
        }
        catch (FormatException)
        {
            // Too few arguments for the template: report the unformatted message
            // rather than throwing into the code that is reporting a health problem.
            return template;
        }
    }
}
}
84 changes: 84 additions & 0 deletions src/Agent/NewRelic/Agent/Core/AgentHealth/HealthCodes.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Copyright 2020 New Relic, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

namespace NewRelic.Agent.Core.AgentHealth
{
/// <summary>
/// Catalogue of health statuses the agent can report. Each entry is a tuple of
/// (IsHealthy, Code, Status); a Status containing <c>{0}</c>/<c>{1}</c> is a format
/// template whose placeholders are filled in by the caller.
/// </summary>
public static class HealthCodes
{
    /// <summary>NR-APM-000: the agent is healthy.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) Healthy =
        (true, "NR-APM-000", "Healthy");

    /// <summary>NR-APM-001: the license key was rejected (HTTP status code 401).</summary>
    public static readonly (bool IsHealthy, string Code, string Status) LicenseKeyInvalid =
        (false, "NR-APM-001", "Invalid license key (HTTP status code 401)");

    /// <summary>NR-APM-002: no license key is present in the configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) LicenseKeyMissing =
        (false, "NR-APM-002", "License key missing in configuration");

    /// <summary>NR-APM-003: New Relic forced a disconnect (HTTP status code 410).</summary>
    public static readonly (bool IsHealthy, string Code, string Status) ForceDisconnect =
        (false, "NR-APM-003", "Forced disconnect received from New Relic (HTTP status code 410)");

    /// <summary>
    /// NR-APM-004: HTTP error while sending data.
    /// Placeholders: {0} = response code, {1} = data type being sent.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) HttpError =
        (false, "NR-APM-004", "HTTP error response code {0} received from New Relic while sending data type {1}");

    /// <summary>NR-APM-005: the agent configuration has no application name.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) ApplicationNameMissing =
        (false, "NR-APM-005", "Missing application name in agent configuration");

    /// <summary>NR-APM-006: more than the maximum of 3 application names are configured.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) MaxApplicationNamesExceeded =
        (false, "NR-APM-006", "The maximum number of configured app names (3) exceeded");

    /// <summary>
    /// NR-APM-007: HTTP proxy configuration error.
    /// Placeholder: {0} = response code.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) HttpProxyError =
        (false, "NR-APM-007", "HTTP Proxy configuration error; response code {0}");

    /// <summary>NR-APM-008: the agent is disabled via configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentDisabledByConfiguration =
        (false, "NR-APM-008", "Agent is disabled via configuration");

    /// <summary>NR-APM-009: the agent could not connect to the New Relic data collector.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) FailedToConnect =
        (false, "NR-APM-009", "Failed to connect to New Relic data collector");

    /// <summary>
    /// NR-APM-099: the agent has shut down.
    /// This is only to be reported when the agent is healthy at shutdown; if the agent
    /// is not healthy at that point, the existing error MUST NOT be overwritten.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentShutdownHealthy =
        (true, "NR-APM-099", "Agent has shutdown");

    // Agent health codes for the .NET agent are 200-299

    /// <summary>
    /// NR-APM-200: the agent shut down because of an exception.
    /// Placeholder: {0} = exception text.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentShutdownError =
        (false, "NR-APM-200", "Agent has shutdown with exception {0}");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -151,5 +151,7 @@ public interface IAgentHealthReporter : IOutOfBandMetricSource
void ReportLogForwardingEnabledWithFramework(string logFramework);
void ReportByteMetric(string metricName, long totalBytes, long? exclusiveBytes = null);
void ReportLoggingEventsEmpty(int count = 1);
void SetSuperAgentStatus((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams);
void PublishSuperAgentHealthCheck();
}
}
15 changes: 13 additions & 2 deletions src/Agent/NewRelic/Agent/Core/AgentManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

using NewRelic.Agent.Api;
using NewRelic.Agent.Configuration;
using NewRelic.Agent.Core.AgentHealth;
using NewRelic.Agent.Core.Commands;
using NewRelic.Agent.Core.Config;
using NewRelic.Agent.Core.Configuration;
Expand Down Expand Up @@ -81,6 +82,7 @@ public static IAgentManager Instance
private IConfiguration Configuration { get { return _configurationSubscription.Configuration; } }
private ThreadProfilingService _threadProfilingService;
private readonly IWrapperService _wrapperService;
private readonly IAgentHealthReporter _agentHealthReporter;

private volatile bool _shutdownEventReceived;
private volatile bool _isInitialized;
Expand Down Expand Up @@ -154,6 +156,9 @@ private AgentManager()
var agentApi = _container.Resolve<IAgentApi>();
_wrapperService = _container.Resolve<IWrapperService>();

// Start the AgentHealthReporter early so that we can potentially report health issues during startup
_agentHealthReporter = _container.Resolve<IAgentHealthReporter>();

// Attempt to auto start the agent once all services have resolved, except in serverless mode
if (!bootstrapConfig.ServerlessModeEnabled)
_container.Resolve<IConnectionManager>().AttemptAutoStart();
Expand Down Expand Up @@ -288,6 +293,7 @@ private void LogInitialized()
"NEW_RELIC_SEND_DATA_ON_EXIT",
"NEW_RELIC_SEND_DATA_ON_EXIT_THRESHOLD_MS",
"NEW_RELIC_AZURE_FUNCTION_MODE_ENABLED",
"NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY"
};

List<string> environmentVariablesSensitive = new List<string> {
Expand All @@ -297,7 +303,9 @@ private void LogInitialized()
"NEW_RELIC_PROXY_USER",
"NEW_RELIC_PROXY_PASS",
"NEW_RELIC_CONFIG_OBSCURING_KEY",
"NEW_RELIC_PROXY_PASS_OBFUSCATED"
"NEW_RELIC_PROXY_PASS_OBFUSCATED",
"NEW_RELIC_SUPERAGENT_FLEET_ID",
"NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION"
};

List<(string,string)> environmentVariablesDeprecated = new List<(string, string)>
Expand Down Expand Up @@ -409,7 +417,7 @@ public ITracer GetTracerImpl(string tracerFactoryName, uint tracerArguments, str
private void ProcessExit(object sender, EventArgs e)
{
Log.Debug("Received a ProcessExit CLR event for the application domain. About to shut down the .NET Agent...");

Shutdown(true);
}

Expand Down Expand Up @@ -437,13 +445,16 @@ private void Shutdown(bool cleanShutdown)

Log.Debug("Shutting down public agent services...");
StopServices();
_agentHealthReporter?.SetSuperAgentStatus(HealthCodes.AgentShutdownHealthy);
}
catch (Exception e)
{
_agentHealthReporter?.SetSuperAgentStatus(HealthCodes.AgentShutdownError, e.Message);
Log.Info(e, "Unexpected exception during agent shutdown");
}
finally
{
_agentHealthReporter?.PublishSuperAgentHealthCheck();
Log.Debug("Shutting down internal agent services...");
Dispose();

Expand Down
Loading

0 comments on commit b91f8f5

Please sign in to comment.