Skip to content

Commit

Permalink
Add NGINX reload counters (#1049)
Browse files Browse the repository at this point in the history
* Add NGINX reload counters
  • Loading branch information
ciarams87 authored Sep 19, 2023
1 parent 4c3185d commit f55c94e
Show file tree
Hide file tree
Showing 6 changed files with 184 additions and 21 deletions.
9 changes: 9 additions & 0 deletions docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,15 @@ NGINX Kubernetes Gateway exports the following metrics:
- These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the
Gateway class of NKG. For example, `nginx_kubernetes_gateway_connections_accepted{class="nginx"}`.

- NGINX Kubernetes Gateway metrics:
- nginx_reloads_total. Number of successful NGINX reloads.
- nginx_reload_errors_total. Number of unsuccessful NGINX reloads.
- nginx_stale_config. 1 means NKG failed to configure NGINX with the latest version of the configuration, which means
NGINX is running with a stale version.
- nginx_reloads_milliseconds. Duration in milliseconds of NGINX reloads (histogram).
- These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the
Gateway class of NKG. For example, `nginx_kubernetes_gateway_nginx_reloads_total{class="nginx"}`.

- [controller-runtime](https://github.com/kubernetes-sigs/controller-runtime) metrics. These include:
- Total number of reconciliation errors per controller
- Length of reconcile queue per controller
Expand Down
47 changes: 31 additions & 16 deletions internal/mode/static/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ func StartManager(cfg config.Config) error {
return fmt.Errorf("cannot clear NGINX configuration folders: %w", err)
}

// Ensure NGINX is running before registering metrics & starting the manager.
if err := ngxruntime.EnsureNginxRunning(ctx); err != nil {
return fmt.Errorf("NGINX is not running: %w", err)
}

mgrCollector, err := createAndRegisterMetricsCollectors(cfg.MetricsConfig.Enabled, cfg.GatewayClassName)
if err != nil {
return fmt.Errorf("cannot create and register metrics collectors: %w", err)
}

statusUpdater := status.NewUpdater(status.UpdaterConfig{
GatewayCtlrName: cfg.GatewayCtlrName,
GatewayClassName: cfg.GatewayClassName,
Expand All @@ -146,7 +156,7 @@ func StartManager(cfg config.Config) error {
cfg.Logger.WithName("nginxFileManager"),
file.NewStdLibOSFileManager(),
),
nginxRuntimeMgr: ngxruntime.NewManagerImpl(),
nginxRuntimeMgr: ngxruntime.NewManagerImpl(mgrCollector),
statusUpdater: statusUpdater,
eventRecorder: recorder,
healthChecker: hc,
Expand Down Expand Up @@ -193,17 +203,6 @@ func StartManager(cfg config.Config) error {
}
}

// Ensure NGINX is running before registering metrics & starting the manager.
if err := ngxruntime.EnsureNginxRunning(ctx); err != nil {
return fmt.Errorf("NGINX is not running: %w", err)
}

if cfg.MetricsConfig.Enabled {
if err := configureNginxMetrics(cfg.GatewayClassName); err != nil {
return err
}
}

cfg.Logger.Info("Starting manager")
return mgr.Start(ctx)
}
Expand Down Expand Up @@ -353,13 +352,29 @@ func setInitialConfig(
return updateControlPlane(&config, logger, eventRecorder, configName, logLevelSetter)
}

func configureNginxMetrics(gatewayClassName string) error {
constLabels := map[string]string{"class": gatewayClassName}
// createAndRegisterMetricsCollectors creates the NGINX status and NGINX runtime manager collectors, registers them,
// and returns the runtime manager collector to be used in the nginxRuntimeMgr.
func createAndRegisterMetricsCollectors(metricsEnabled bool, gwClassName string) (ngxruntime.ManagerCollector, error) {
if !metricsEnabled {
// return a no-op collector to avoid nil pointer errors when metrics are disabled
return nkgmetrics.NewManagerNoopCollector(), nil
}
constLabels := map[string]string{"class": gwClassName}

ngxCollector, err := nkgmetrics.NewNginxMetricsCollector(constLabels)
if err != nil {
return fmt.Errorf("cannot get NGINX metrics: %w", err)
return nil, fmt.Errorf("cannot create NGINX status metrics collector: %w", err)
}
if err := metrics.Registry.Register(ngxCollector); err != nil {
return nil, fmt.Errorf("failed to register NGINX status metrics collector: %w", err)
}
return metrics.Registry.Register(ngxCollector)

mgrCollector := nkgmetrics.NewManagerMetricsCollector(constLabels)
if err := metrics.Registry.Register(mgrCollector); err != nil {
return nil, fmt.Errorf("failed to register NGINX manager runtime metrics collector: %w", err)
}

return mgrCollector, nil
}

func getMetricsOptions(cfg config.MetricsConfig) metricsserver.Options {
Expand Down
115 changes: 115 additions & 0 deletions internal/mode/static/metrics/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
package metrics

import (
"time"

"github.com/prometheus/client_golang/prometheus"
)

// ManagerMetricsCollector implements ManagerCollector interface and prometheus.Collector interface
// It bundles the reload-related metrics of the NGINX runtime manager so that they can be
// registered and collected as a single unit.
type ManagerMetricsCollector struct {
	// Metrics
	// reloadsTotal counts successful NGINX reloads.
	reloadsTotal prometheus.Counter
	// reloadsError counts unsuccessful NGINX reloads.
	reloadsError prometheus.Counter
	// configStale is 1 after a failed reload and 0 after a successful one.
	configStale prometheus.Gauge
	// reloadsDuration is a histogram of NGINX reload durations, in milliseconds.
	reloadsDuration prometheus.Histogram
}

// NewManagerMetricsCollector builds a ManagerMetricsCollector whose metrics all live under
// metricsNamespace and carry constLabels as constant labels.
// The collector is only constructed here; registering it is left to the caller.
func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCollector {
	successfulReloads := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace:   metricsNamespace,
		Name:        "nginx_reloads_total",
		Help:        "Number of successful NGINX reloads",
		ConstLabels: constLabels,
	})

	failedReloads := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace:   metricsNamespace,
		Name:        "nginx_reload_errors_total",
		Help:        "Number of unsuccessful NGINX reloads",
		ConstLabels: constLabels,
	})

	staleConfig := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace:   metricsNamespace,
		Name:        "nginx_stale_config",
		Help:        "Indicates if NGINX is not serving the latest configuration.",
		ConstLabels: constLabels,
	})

	// Bucket upper bounds are expressed in milliseconds, matching the unit of the observations.
	reloadDurations := prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace:   metricsNamespace,
		Name:        "nginx_reloads_milliseconds",
		Help:        "Duration in milliseconds of NGINX reloads",
		ConstLabels: constLabels,
		Buckets:     []float64{500, 1000, 5000, 10000, 30000},
	})

	return &ManagerMetricsCollector{
		reloadsTotal:    successfulReloads,
		reloadsError:    failedReloads,
		configStale:     staleConfig,
		reloadsDuration: reloadDurations,
	}
}

// IncReloadCount increments the counter of successful NGINX reloads and sets the stale config status to false.
func (mc *ManagerMetricsCollector) IncReloadCount() {
	mc.reloadsTotal.Inc()
	// A successful reload means NGINX is serving the latest configuration again.
	mc.updateConfigStaleStatus(false)
}

// IncReloadErrors increments the counter of NGINX reload errors and sets the stale config status to true.
func (mc *ManagerMetricsCollector) IncReloadErrors() {
	mc.reloadsError.Inc()
	// A failed reload leaves NGINX on the previous configuration, so flag it as stale.
	mc.updateConfigStaleStatus(true)
}

// updateConfigStaleStatus sets the stale-config gauge: 1 when stale is true, 0 otherwise.
func (mc *ManagerMetricsCollector) updateConfigStaleStatus(stale bool) {
	if stale {
		mc.configStale.Set(1.0)
		return
	}
	mc.configStale.Set(0)
}

// ObserveLastReloadTime records the duration of an NGINX reload in the reload histogram.
// The duration is converted to whole milliseconds; any sub-millisecond remainder is dropped.
func (mc *ManagerMetricsCollector) ObserveLastReloadTime(duration time.Duration) {
	elapsedMs := duration.Milliseconds()
	mc.reloadsDuration.Observe(float64(elapsedMs))
}

// Describe implements the prometheus.Collector interface Describe method by forwarding
// the channel to each owned metric in turn.
func (mc *ManagerMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
	owned := []prometheus.Collector{
		mc.reloadsTotal,
		mc.reloadsError,
		mc.configStale,
		mc.reloadsDuration,
	}
	for _, metric := range owned {
		metric.Describe(ch)
	}
}

// Collect implements the prometheus.Collector interface Collect method by forwarding
// the channel to each owned metric in turn.
func (mc *ManagerMetricsCollector) Collect(ch chan<- prometheus.Metric) {
	owned := []prometheus.Collector{
		mc.reloadsTotal,
		mc.reloadsError,
		mc.configStale,
		mc.reloadsDuration,
	}
	for _, metric := range owned {
		metric.Collect(ch)
	}
}

// ManagerNoopCollector provides do-nothing versions of the ManagerCollector methods.
// It stands in for a real collector when metrics are disabled, so callers never hold a nil collector.
type ManagerNoopCollector struct{}

// NewManagerNoopCollector returns a ready-to-use ManagerNoopCollector.
func NewManagerNoopCollector() *ManagerNoopCollector {
	return new(ManagerNoopCollector)
}

// IncReloadCount does nothing.
func (*ManagerNoopCollector) IncReloadCount() {}

// IncReloadErrors does nothing.
func (*ManagerNoopCollector) IncReloadErrors() {}

// ObserveLastReloadTime does nothing; the supplied duration is ignored.
func (*ManagerNoopCollector) ObserveLastReloadTime(time.Duration) {}
4 changes: 4 additions & 0 deletions internal/mode/static/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
package metrics

// metricsNamespace is the Prometheus namespace used for the metrics defined in this package.
// nolint:gosec // flagged as potential hardcoded credentials, but is not sensitive
const metricsNamespace = "nginx_kubernetes_gateway"
2 changes: 1 addition & 1 deletion internal/mode/static/metrics/nginx.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func NewNginxMetricsCollector(constLabels map[string]string) (prometheus.Collect
if err != nil {
return nil, err
}
return nginxCollector.NewNginxCollector(client, "nginx_kubernetes_gateway", constLabels), nil
return nginxCollector.NewNginxCollector(client, metricsNamespace, constLabels), nil
}

// getSocketClient gets an http.Client with a unix socket transport.
Expand Down
28 changes: 24 additions & 4 deletions internal/mode/static/nginx/runtime/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,29 @@ type Manager interface {
Reload(ctx context.Context, configVersion int) error
}

// ManagerCollector is an interface for the metrics of the NGINX runtime manager.
type ManagerCollector interface {
	// IncReloadCount records one successful NGINX reload.
	IncReloadCount()
	// IncReloadErrors records one failed NGINX reload.
	IncReloadErrors()
	// ObserveLastReloadTime records how long a completed NGINX reload took.
	// The argument is a time.Duration, not a raw millisecond count; implementations
	// are responsible for any unit conversion.
	ObserveLastReloadTime(duration time.Duration)
}

// ManagerImpl implements Manager.
type ManagerImpl struct {
verifyClient *verifyClient
verifyClient *verifyClient
managerCollector ManagerCollector
}

// NewManagerImpl creates a new ManagerImpl.
func NewManagerImpl() *ManagerImpl {
func NewManagerImpl(managerCollector ManagerCollector) *ManagerImpl {
return &ManagerImpl{
verifyClient: newVerifyClient(nginxReloadTimeout),
verifyClient: newVerifyClient(nginxReloadTimeout),
managerCollector: managerCollector,
}
}

func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error {
start := time.Now()
// We find the main NGINX PID on every reload because it will change if the NGINX container is restarted.
pid, err := findMainProcess(ctx, os.Stat, os.ReadFile, pidFileTimeout)
if err != nil {
Expand All @@ -69,6 +79,7 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error {
// send HUP signal to the NGINX main process reload configuration
// See https://nginx.org/en/docs/control.html
if err := syscall.Kill(pid, syscall.SIGHUP); err != nil {
m.managerCollector.IncReloadErrors()
return fmt.Errorf("failed to send the HUP signal to NGINX main: %w", err)
}

Expand All @@ -79,10 +90,19 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error {
os.ReadFile,
childProcsTimeout,
); err != nil {
m.managerCollector.IncReloadErrors()
return fmt.Errorf(noNewWorkersErrFmt, configVersion, err)
}

return m.verifyClient.waitForCorrectVersion(ctx, configVersion)
if err = m.verifyClient.waitForCorrectVersion(ctx, configVersion); err != nil {
m.managerCollector.IncReloadErrors()
return err
}
m.managerCollector.IncReloadCount()

finish := time.Now()
m.managerCollector.ObserveLastReloadTime(finish.Sub(start))
return nil
}

// EnsureNginxRunning ensures NGINX is running by locating the main process.
Expand Down

0 comments on commit f55c94e

Please sign in to comment.