From d3a896c4e52343a4f363e1235b7c346b544948f8 Mon Sep 17 00:00:00 2001 From: Haibing Zhou Date: Wed, 10 Jan 2024 10:21:08 -0800 Subject: [PATCH] webhook: add options to disable resource_namespace tag in metrics To add some context, historically, `resource_name` was removed from this tag list due to its high potential of causing high metrics cardinality. See [knative/pkg#1464][1] for more information. While that's great, but it might not be sufficient for large scale use cases where namespaces can be super dynamic (with generateName, too) or grows fase enough. There is an issue report from [tektoncd/pipeline#3171][2] which talks about this. This proposal makes it possible to disable `resource_namespace` tag via an option function. The default behavior is not changed, so no user impact if any of existing users rely on this tag. There is no API contract change either due to the beauty of variadic functions. Now downstream projects can consume this by override `StatsReporter` in webhook context options with their own preference. As a caveat here, if downstream project does choose to override `StatsReporter`, the default `ReportMetrics` function shouldn't be called by default as they may now have a different set of tag keys to report. As such, this function is only called if the default `StatsReporter` is used. [1]: https://github.com/knative/pkg/pull/1464 [2]: https://github.com/tektoncd/pipeline/issues/3171 --- webhook/stats_reporter.go | 152 +++++++++++++++++++++++++-------- webhook/stats_reporter_test.go | 49 ++++++++++- 2 files changed, 160 insertions(+), 41 deletions(-) diff --git a/webhook/stats_reporter.go b/webhook/stats_reporter.go index 1fe30e8af1..5739ba6ab9 100644 --- a/webhook/stats_reporter.go +++ b/webhook/stats_reporter.go @@ -65,19 +65,89 @@ var ( resultCodeKey = tag.MustNewKey("result_code") ) +type admissionToValue func(*admissionv1.AdmissionRequest, *admissionv1.AdmissionResponse) string +type conversionToValue func(*apixv1.ConversionRequest, *apixv1.ConversionResponse) string + +var ( + // Create the tag keys that will be used to add tags to our measurements. + // Tag keys must conform to the restrictions described in + // go.opencensus.io/tag/validate.go. Currently those restrictions are: + // - length between 1 and 255 inclusive + // - characters are printable US-ASCII + admissionTags = map[tag.Key]admissionToValue{ + requestOperationKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return string(req.Operation) + }, + kindGroupKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Kind.Group + }, + kindVersionKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Kind.Version + }, + kindKindKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Kind.Kind + }, + resourceGroupKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Resource.Group + }, + resourceVersionKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Resource.Version + }, + resourceResourceKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Resource.Resource + }, + resourceNamespaceKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Namespace + }, + admissionAllowedKey: func(_ *admissionv1.AdmissionRequest, resp *admissionv1.AdmissionResponse) string { + return strconv.FormatBool(resp.Allowed) + }, + } + conversionTags = map[tag.Key]conversionToValue{ + desiredAPIVersionKey: func(req *apixv1.ConversionRequest, _ *apixv1.ConversionResponse) string { + return req.DesiredAPIVersion + }, + resultStatusKey: func(_ *apixv1.ConversionRequest, resp *apixv1.ConversionResponse) string { + return resp.Result.Status + }, + resultReasonKey: func(_ *apixv1.ConversionRequest, resp *apixv1.ConversionResponse) string { + return string(resp.Result.Reason) + }, + resultCodeKey: func(_ *apixv1.ConversionRequest, resp *apixv1.ConversionResponse) string { + return strconv.Itoa(int(resp.Result.Code)) + }, + } +) + // StatsReporter reports webhook metrics type StatsReporter interface { ReportAdmissionRequest(request *admissionv1.AdmissionRequest, response *admissionv1.AdmissionResponse, d time.Duration) error ReportConversionRequest(request *apixv1.ConversionRequest, response *apixv1.ConversionResponse, d time.Duration) error } +type options struct { + tagsToExclude map[string]struct{} +} + +type Option func(_ *options) + +func WithoutTag(tag string) Option { + return func(opts *options) { + if opts.tagsToExclude == nil { + opts.tagsToExclude = make(map[string]struct{}) + } + opts.tagsToExclude[tag] = struct{}{} + } +} + // reporter implements StatsReporter interface type reporter struct { - ctx context.Context + ctx context.Context + opts options } // NewStatsReporter creates a reporter for webhook metrics -func NewStatsReporter() (StatsReporter, error) { +func NewStatsReporter(opts ...Option) (StatsReporter, error) { ctx, err := tag.New( context.Background(), ) @@ -85,23 +155,26 @@ func NewStatsReporter() (StatsReporter, error) { return nil, err } - return &reporter{ctx: ctx}, nil + options := options{} + for _, opt := range opts { + opt(&options) + } + + return &reporter{ctx: ctx, opts: options}, nil } // Captures req count metric, recording the count and the duration func (r *reporter) ReportAdmissionRequest(req *admissionv1.AdmissionRequest, resp *admissionv1.AdmissionResponse, d time.Duration) error { - ctx, err := tag.New( - r.ctx, - tag.Insert(requestOperationKey, string(req.Operation)), - tag.Insert(kindGroupKey, req.Kind.Group), - tag.Insert(kindVersionKey, req.Kind.Version), - tag.Insert(kindKindKey, req.Kind.Kind), - tag.Insert(resourceGroupKey, req.Resource.Group), - tag.Insert(resourceVersionKey, req.Resource.Version), - tag.Insert(resourceResourceKey, req.Resource.Resource), - tag.Insert(resourceNamespaceKey, req.Namespace), - tag.Insert(admissionAllowedKey, strconv.FormatBool(resp.Allowed)), - ) + mutators := []tag.Mutator{} + + for key, f := range admissionTags { + if _, ok := r.opts.tagsToExclude[key.Name()]; ok { + continue + } + mutators = append(mutators, tag.Insert(key, f(req, resp))) + } + + ctx, err := tag.New(r.ctx, mutators...) if err != nil { return err } @@ -114,13 +187,16 @@ func (r *reporter) ReportAdmissionRequest(req *admissionv1.AdmissionRequest, res // Captures req count metric, recording the count and the duration func (r *reporter) ReportConversionRequest(req *apixv1.ConversionRequest, resp *apixv1.ConversionResponse, d time.Duration) error { - ctx, err := tag.New( - r.ctx, - tag.Insert(desiredAPIVersionKey, req.DesiredAPIVersion), - tag.Insert(resultStatusKey, resp.Result.Status), - tag.Insert(resultReasonKey, string(resp.Result.Reason)), - tag.Insert(resultCodeKey, strconv.Itoa(int(resp.Result.Code))), - ) + mutators := []tag.Mutator{} + + for key, f := range conversionTags { + if _, ok := r.opts.tagsToExclude[key.Name()]; ok { + continue + } + mutators = append(mutators, tag.Insert(key, f(req, resp))) + } + + ctx, err := tag.New(r.ctx, mutators...) if err != nil { return err } @@ -131,21 +207,23 @@ func (r *reporter) ReportConversionRequest(req *apixv1.ConversionRequest, resp * return nil } -func RegisterMetrics() { - tagKeys := []tag.Key{ - requestOperationKey, - kindGroupKey, - kindVersionKey, - kindKindKey, - resourceGroupKey, - resourceVersionKey, - resourceResourceKey, - resourceNamespaceKey, - admissionAllowedKey, - desiredAPIVersionKey, - resultStatusKey, - resultReasonKey, - resultCodeKey} +func RegisterMetrics(opts ...Option) { + options := options{} + for _, opt := range opts { + opt(&options) + } + + tagKeys := []tag.Key{} + for tag := range admissionTags { + if _, ok := options.tagsToExclude[tag.Name()]; !ok { + tagKeys = append(tagKeys, tag) + } + } + for tag := range conversionTags { + if _, ok := options.tagsToExclude[tag.Name()]; !ok { + tagKeys = append(tagKeys, tag) + } + } if err := view.Register( &view.View{ diff --git a/webhook/stats_reporter_test.go b/webhook/stats_reporter_test.go index e3911b4d4a..8b3f320a14 100644 --- a/webhook/stats_reporter_test.go +++ b/webhook/stats_reporter_test.go @@ -70,6 +70,47 @@ func TestWebhookStatsReporterAdmission(t *testing.T) { metricstest.CheckDistributionData(t, requestLatenciesName, expectedTags, 2, shortTime, longTime) } +func TestWebhookStatsReporterAdmissionWithoutNamespaceTag(t *testing.T) { + setup(WithoutTag(resourceNamespaceKey.Name())) + req := &admissionv1.AdmissionRequest{ + UID: "705ab4f5-6393-11e8-b7cc-42010a800002", + Kind: metav1.GroupVersionKind{Group: "autoscaling", Version: "v1", Kind: "Scale"}, + Resource: metav1.GroupVersionResource{Group: "apps", Version: "v1", Resource: "deployments"}, + Name: "my-deployment", + Namespace: "my-namespace", + Operation: admissionv1.Update, + } + + resp := &admissionv1.AdmissionResponse{ + UID: req.UID, + Allowed: true, + } + + r, _ := NewStatsReporter(WithoutTag(resourceNamespaceKey.Name())) + + shortTime, longTime := 1100.0, 9100.0 + expectedTags := map[string]string{ + requestOperationKey.Name(): string(req.Operation), + kindGroupKey.Name(): req.Kind.Group, + kindVersionKey.Name(): req.Kind.Version, + kindKindKey.Name(): req.Kind.Kind, + resourceGroupKey.Name(): req.Resource.Group, + resourceVersionKey.Name(): req.Resource.Version, + resourceResourceKey.Name(): req.Resource.Resource, + admissionAllowedKey.Name(): strconv.FormatBool(resp.Allowed), + } + + if err := r.ReportAdmissionRequest(req, resp, time.Duration(shortTime)*time.Millisecond); err != nil { + t.Fatalf("ReportAdmissionRequest() = %v", err) + } + if err := r.ReportAdmissionRequest(req, resp, time.Duration(longTime)*time.Millisecond); err != nil { + t.Fatalf("ReportAdmissionRequest() = %v", err) + } + + metricstest.CheckCountData(t, requestCountName, expectedTags, 2) + metricstest.CheckDistributionData(t, requestLatenciesName, expectedTags, 2, shortTime, longTime) +} + func TestWebhookStatsReporterConversion(t *testing.T) { setup() req := &apixv1.ConversionRequest{ @@ -103,12 +144,12 @@ func TestWebhookStatsReporterConversion(t *testing.T) { metricstest.CheckDistributionData(t, requestLatenciesName, expectedTags, 2, shortTime, longTime) } -func setup() { - resetMetrics() +func setup(opts ...Option) { + resetMetrics(opts...) } // opencensus metrics carry global state that need to be reset between unit tests -func resetMetrics() { +func resetMetrics(opts ...Option) { metricstest.Unregister(requestCountName, requestLatenciesName) - RegisterMetrics() + RegisterMetrics(opts...) }