diff --git a/docs/monitoring.md b/docs/monitoring.md index 5e5862157d..1a47141a7c 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -86,6 +86,15 @@ NGINX Kubernetes Gateway exports the following metrics: - These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the Gateway class of NKG. For example, `nginx_kubernetes_gateway_connections_accepted{class="nginx"}`. +- NGINX Kubernetes Gateway metrics: + - nginx_reloads_total. Number of successful NGINX reloads. + - nginx_reload_errors_total. Number of unsuccessful NGINX reloads. + - nginx_stale_config. 1 means NKG failed to configure NGINX with the latest version of the configuration, which means + NGINX is running with a stale version. + - nginx_reloads_milliseconds. Duration in milliseconds of NGINX reloads (histogram). + - These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the + Gateway class of NKG. For example, `nginx_kubernetes_gateway_nginx_reloads_total{class="nginx"}`. + - [controller-runtime](https://github.com/kubernetes-sigs/controller-runtime) metrics. These include: - Total number of reconciliation errors per controller - Length of reconcile queue per controller diff --git a/internal/mode/static/manager.go b/internal/mode/static/manager.go index a140f581d9..e84bc1404f 100644 --- a/internal/mode/static/manager.go +++ b/internal/mode/static/manager.go @@ -125,6 +125,16 @@ func StartManager(cfg config.Config) error { return fmt.Errorf("cannot clear NGINX configuration folders: %w", err) } + // Ensure NGINX is running before registering metrics & starting the manager. 
+ if err := ngxruntime.EnsureNginxRunning(ctx); err != nil { + return fmt.Errorf("NGINX is not running: %w", err) + } + + mgrCollector, err := createAndRegisterMetricsCollectors(cfg.MetricsConfig.Enabled, cfg.GatewayClassName) + if err != nil { + return fmt.Errorf("cannot create and register metrics collectors: %w", err) + } + statusUpdater := status.NewUpdater(status.UpdaterConfig{ GatewayCtlrName: cfg.GatewayCtlrName, GatewayClassName: cfg.GatewayClassName, @@ -146,7 +156,7 @@ func StartManager(cfg config.Config) error { cfg.Logger.WithName("nginxFileManager"), file.NewStdLibOSFileManager(), ), - nginxRuntimeMgr: ngxruntime.NewManagerImpl(), + nginxRuntimeMgr: ngxruntime.NewManagerImpl(mgrCollector), statusUpdater: statusUpdater, eventRecorder: recorder, healthChecker: hc, @@ -193,17 +203,6 @@ func StartManager(cfg config.Config) error { } } - // Ensure NGINX is running before registering metrics & starting the manager. - if err := ngxruntime.EnsureNginxRunning(ctx); err != nil { - return fmt.Errorf("NGINX is not running: %w", err) - } - - if cfg.MetricsConfig.Enabled { - if err := configureNginxMetrics(cfg.GatewayClassName); err != nil { - return err - } - } - cfg.Logger.Info("Starting manager") return mgr.Start(ctx) } @@ -353,13 +352,29 @@ func setInitialConfig( return updateControlPlane(&config, logger, eventRecorder, configName, logLevelSetter) } -func configureNginxMetrics(gatewayClassName string) error { - constLabels := map[string]string{"class": gatewayClassName} +// createAndRegisterMetricsCollectors creates the NGINX status and NGINX runtime manager collectors, registers them, +// and returns the runtime manager collector to be used in the nginxRuntimeMgr. 
+func createAndRegisterMetricsCollectors(metricsEnabled bool, gwClassName string) (ngxruntime.ManagerCollector, error) { + if !metricsEnabled { + // return a no-op collector to avoid nil pointer errors when metrics are disabled + return nkgmetrics.NewManagerNoopCollector(), nil + } + constLabels := map[string]string{"class": gwClassName} + ngxCollector, err := nkgmetrics.NewNginxMetricsCollector(constLabels) if err != nil { - return fmt.Errorf("cannot get NGINX metrics: %w", err) + return nil, fmt.Errorf("cannot create NGINX status metrics collector: %w", err) + } + if err := metrics.Registry.Register(ngxCollector); err != nil { + return nil, fmt.Errorf("failed to register NGINX status metrics collector: %w", err) } - return metrics.Registry.Register(ngxCollector) + + mgrCollector := nkgmetrics.NewManagerMetricsCollector(constLabels) + if err := metrics.Registry.Register(mgrCollector); err != nil { + return nil, fmt.Errorf("failed to register NGINX manager runtime metrics collector: %w", err) + } + + return mgrCollector, nil } func getMetricsOptions(cfg config.MetricsConfig) metricsserver.Options { diff --git a/internal/mode/static/metrics/collector.go b/internal/mode/static/metrics/collector.go new file mode 100644 index 0000000000..daa454ed85 --- /dev/null +++ b/internal/mode/static/metrics/collector.go @@ -0,0 +1,115 @@ +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +// ManagerMetricsCollector implements ManagerCollector interface and prometheus.Collector interface +type ManagerMetricsCollector struct { + // Metrics + reloadsTotal prometheus.Counter + reloadsError prometheus.Counter + configStale prometheus.Gauge + reloadsDuration prometheus.Histogram +} + +// NewManagerMetricsCollector creates a new ManagerMetricsCollector +func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCollector { + nc := &ManagerMetricsCollector{ + reloadsTotal: prometheus.NewCounter( + prometheus.CounterOpts{ + 
+ Name: "nginx_reloads_total", + Namespace: metricsNamespace, + Help: "Number of successful NGINX reloads", + ConstLabels: constLabels, + }), + reloadsError: prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "nginx_reload_errors_total", + Namespace: metricsNamespace, + Help: "Number of unsuccessful NGINX reloads", + ConstLabels: constLabels, + }, + ), + configStale: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "nginx_stale_config", + Namespace: metricsNamespace, + Help: "Indicates if NGINX is not serving the latest configuration.", + ConstLabels: constLabels, + }, + ), + reloadsDuration: prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "nginx_reloads_milliseconds", + Namespace: metricsNamespace, + Help: "Duration in milliseconds of NGINX reloads", + ConstLabels: constLabels, + Buckets: []float64{500, 1000, 5000, 10000, 30000}, + }, + ), + } + return nc +} + +// IncReloadCount increments the counter of successful NGINX reloads and sets the stale config status to false. +func (mc *ManagerMetricsCollector) IncReloadCount() { + mc.reloadsTotal.Inc() + mc.updateConfigStaleStatus(false) +} + +// IncReloadErrors increments the counter of NGINX reload errors and sets the stale config status to true. +func (mc *ManagerMetricsCollector) IncReloadErrors() { + mc.reloadsError.Inc() + mc.updateConfigStaleStatus(true) +} + +// updateConfigStaleStatus sets the stale config gauge to 1 when stale is true and to 0 otherwise. +func (mc *ManagerMetricsCollector) updateConfigStaleStatus(stale bool) { + var status float64 + if stale { + status = 1.0 + } + mc.configStale.Set(status) +} + +// ObserveLastReloadTime adds the last NGINX reload time, in whole milliseconds, to the histogram. +func (mc *ManagerMetricsCollector) ObserveLastReloadTime(duration time.Duration) { + mc.reloadsDuration.Observe(float64(duration / time.Millisecond)) +} + +// Describe implements prometheus.Collector interface Describe method. 
+func (mc *ManagerMetricsCollector) Describe(ch chan<- *prometheus.Desc) { + mc.reloadsTotal.Describe(ch) + mc.reloadsError.Describe(ch) + mc.configStale.Describe(ch) + mc.reloadsDuration.Describe(ch) +} + +// Collect implements the prometheus.Collector interface Collect method. +func (mc *ManagerMetricsCollector) Collect(ch chan<- prometheus.Metric) { + mc.reloadsTotal.Collect(ch) + mc.reloadsError.Collect(ch) + mc.configStale.Collect(ch) + mc.reloadsDuration.Collect(ch) +} + +// ManagerNoopCollector is a no-op collector that will implement ManagerCollector interface. +// Used to initialize the ManagerCollector when metrics are disabled to avoid nil pointer errors. +type ManagerNoopCollector struct{} + +// NewManagerNoopCollector creates a no-op collector that implements ManagerCollector interface. +func NewManagerNoopCollector() *ManagerNoopCollector { + return &ManagerNoopCollector{} +} + +// IncReloadCount implements a no-op IncReloadCount. +func (mc *ManagerNoopCollector) IncReloadCount() {} + +// IncReloadErrors implements a no-op IncReloadErrors. +func (mc *ManagerNoopCollector) IncReloadErrors() {} + +// ObserveLastReloadTime implements a no-op ObserveLastReloadTime. 
+func (mc *ManagerNoopCollector) ObserveLastReloadTime(_ time.Duration) {} diff --git a/internal/mode/static/metrics/metrics.go b/internal/mode/static/metrics/metrics.go new file mode 100644 index 0000000000..1d9e85e7e6 --- /dev/null +++ b/internal/mode/static/metrics/metrics.go @@ -0,0 +1,4 @@ +package metrics + +// nolint:gosec // flagged as potential hardcoded credentials, but is not sensitive +const metricsNamespace = "nginx_kubernetes_gateway" diff --git a/internal/mode/static/metrics/nginx.go b/internal/mode/static/metrics/nginx.go index 33946722d2..c2eadd265f 100644 --- a/internal/mode/static/metrics/nginx.go +++ b/internal/mode/static/metrics/nginx.go @@ -24,7 +24,7 @@ func NewNginxMetricsCollector(constLabels map[string]string) (prometheus.Collect if err != nil { return nil, err } - return nginxCollector.NewNginxCollector(client, "nginx_kubernetes_gateway", constLabels), nil + return nginxCollector.NewNginxCollector(client, metricsNamespace, constLabels), nil } // getSocketClient gets an http.Client with a unix socket transport. diff --git a/internal/mode/static/nginx/runtime/manager.go b/internal/mode/static/nginx/runtime/manager.go index eb7f48c208..4e64fa56f1 100644 --- a/internal/mode/static/nginx/runtime/manager.go +++ b/internal/mode/static/nginx/runtime/manager.go @@ -41,19 +41,29 @@ type Manager interface { Reload(ctx context.Context, configVersion int) error } +// ManagerCollector is an interface for the metrics of the NGINX runtime manager. +type ManagerCollector interface { + IncReloadCount() + IncReloadErrors() + ObserveLastReloadTime(ms time.Duration) +} + // ManagerImpl implements Manager. type ManagerImpl struct { - verifyClient *verifyClient + verifyClient *verifyClient + managerCollector ManagerCollector } // NewManagerImpl creates a new ManagerImpl. 
-func NewManagerImpl() *ManagerImpl { +func NewManagerImpl(managerCollector ManagerCollector) *ManagerImpl { return &ManagerImpl{ - verifyClient: newVerifyClient(nginxReloadTimeout), + verifyClient: newVerifyClient(nginxReloadTimeout), + managerCollector: managerCollector, } } func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { + start := time.Now() // We find the main NGINX PID on every reload because it will change if the NGINX container is restarted. pid, err := findMainProcess(ctx, os.Stat, os.ReadFile, pidFileTimeout) if err != nil { @@ -69,6 +79,7 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { // send HUP signal to the NGINX main process reload configuration // See https://nginx.org/en/docs/control.html if err := syscall.Kill(pid, syscall.SIGHUP); err != nil { + m.managerCollector.IncReloadErrors() return fmt.Errorf("failed to send the HUP signal to NGINX main: %w", err) } @@ -79,10 +90,19 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { os.ReadFile, childProcsTimeout, ); err != nil { + m.managerCollector.IncReloadErrors() return fmt.Errorf(noNewWorkersErrFmt, configVersion, err) } - return m.verifyClient.waitForCorrectVersion(ctx, configVersion) + if err = m.verifyClient.waitForCorrectVersion(ctx, configVersion); err != nil { + m.managerCollector.IncReloadErrors() + return err + } + m.managerCollector.IncReloadCount() + + finish := time.Now() + m.managerCollector.ObserveLastReloadTime(finish.Sub(start)) + return nil } // EnsureNginxRunning ensures NGINX is running by locating the main process.