diff --git a/pkg/service/healthcheck/metrics_test.go b/pkg/service/healthcheck/metrics_test.go index 67bad84ab..5bf6dc7d2 100644 --- a/pkg/service/healthcheck/metrics_test.go +++ b/pkg/service/healthcheck/metrics_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/prometheus/client_golang/prometheus" + "github.com/scylladb/go-log" "github.com/scylladb/scylla-manager/v3/pkg/util/uuid" ) @@ -27,10 +28,13 @@ func TestRemoveClusterMetricsWhenNumberOfMetricsExceedsDefaultChannelLength_2843 } metric.With(hl).Set(1) } - r := runner{metrics: &runnerMetrics{ - status: metric, - rtt: metric, - }} + r := runner{ + logger: log.NewDevelopment(), + metrics: &runnerMetrics{ + status: metric, + rtt: metric, + }, + } r.removeMetricsForCluster(clusterID) } diff --git a/pkg/service/healthcheck/runner.go b/pkg/service/healthcheck/runner.go index efe45c3ca..4a5a166af 100644 --- a/pkg/service/healthcheck/runner.go +++ b/pkg/service/healthcheck/runner.go @@ -9,6 +9,7 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" + "github.com/scylladb/go-log" "github.com/scylladb/go-set/strset" "github.com/scylladb/scylla-manager/v3/pkg/scyllaclient" @@ -43,6 +44,7 @@ func (r Runner) Run(ctx context.Context, clusterID, taskID, runID uuid.UUID, pro } type runner struct { + logger log.Logger scyllaClient scyllaclient.ProviderFunc timeout time.Duration metrics *runnerMetrics @@ -91,13 +93,7 @@ func (r runner) checkHosts(ctx context.Context, clusterID uuid.UUID, status []sc rtt, err := r.ping(ctx, clusterID, status[i].Addr, r.timeout) if err != nil { - // Set -2 for unavailable agent and -1 for unavailable Scylla - _, err := r.pingAgent(ctx, clusterID, status[i].Addr, r.timeout) - if err != nil { - r.metrics.status.With(hl).Set(-2) - } else { - r.metrics.status.With(hl).Set(-1) - } + r.metrics.status.With(hl).Set(-1) } else { r.metrics.status.With(hl).Set(1) } @@ -106,7 +102,9 @@ func (r runner) checkHosts(ctx context.Context, clusterID uuid.UUID, status []sc return 
nil } - _ = parallel.Run(len(status), parallel.NoLimit, f, parallel.NopNotify) // nolint: errcheck + _ = parallel.Run(len(status), parallel.NoLimit, f, func(i int, err error) { // nolint: errcheck + r.logger.Error(ctx, "Parallel hosts check failed", "host", status[i].Addr, "error", err) + }) } func (r runner) removeMetricsForCluster(clusterID uuid.UUID) { diff --git a/pkg/service/healthcheck/service.go b/pkg/service/healthcheck/service.go index 2cbdf4907..be9acee68 100644 --- a/pkg/service/healthcheck/service.go +++ b/pkg/service/healthcheck/service.go @@ -56,6 +56,7 @@ func NewService(config Config, scyllaClient scyllaclient.ProviderFunc, secretsSt func (s *Service) Runner() Runner { return Runner{ cql: runner{ + logger: s.logger.Named("CQL healthcheck"), scyllaClient: s.scyllaClient, timeout: s.config.MaxTimeout, metrics: &runnerMetrics{ @@ -66,6 +67,7 @@ func (s *Service) Runner() Runner { pingAgent: s.pingAgent, }, rest: runner{ + logger: s.logger.Named("REST healthcheck"), scyllaClient: s.scyllaClient, timeout: s.config.MaxTimeout, metrics: &runnerMetrics{ @@ -76,6 +78,7 @@ func (s *Service) Runner() Runner { pingAgent: s.pingAgent, }, alternator: runner{ + logger: s.logger.Named("Alternator healthcheck"), scyllaClient: s.scyllaClient, timeout: s.config.MaxTimeout, metrics: &runnerMetrics{