From 1b329273e72bb998cab42650d14e47a8e39cb91c Mon Sep 17 00:00:00 2001 From: Jeremy Clerc Date: Wed, 10 Jun 2026 09:13:02 +0200 Subject: [PATCH] Add metrics for v2 hooks, add type to rejected/granted metrics In a similar fashion to what we have for v1, add some metrics about ADB v2 hooks status codes, so we can track if some v2 hooks do not answer as expected. Add the type of the NodeDisruption to disruption budget grant and reject metrics to help understand if rejections are specific to a type. --- .../applicationdisruptionbudget_controller.go | 22 ++++++++++++++++ internal/controller/metrics.go | 26 +++++++++++++++++-- .../controller/nodedisruption_controller.go | 9 ++++--- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/internal/controller/applicationdisruptionbudget_controller.go b/internal/controller/applicationdisruptionbudget_controller.go index b54976a..58b6f3b 100644 --- a/internal/controller/applicationdisruptionbudget_controller.go +++ b/internal/controller/applicationdisruptionbudget_controller.go @@ -33,6 +33,7 @@ import ( "github.com/criteo/node-disruption-controller/internal/appmgrcli/disruption" "github.com/criteo/node-disruption-controller/pkg/resolver" + openapiruntime "github.com/go-openapi/runtime" httptransport "github.com/go-openapi/runtime/client" "github.com/go-openapi/strfmt" "github.com/prometheus/client_golang/prometheus" @@ -251,6 +252,10 @@ func (r *ApplicationDisruptionBudgetResolver) CallPrepareHook(ctx context.Contex return err } + statusCode := 200 + defer func() { + DisruptionBudgetCheckPrepareHookStatusCodeTotal.WithLabelValues(r.ApplicationDisruptionBudget.Namespace, r.ApplicationDisruptionBudget.Name, r.ApplicationDisruptionBudget.Kind, strconv.Itoa(statusCode)).Inc() + }() _, err = svc.PrepareApplication(&disruption.PrepareApplicationParams{ Body: r.hookBody(nd), HTTPClient: &http.Client{Timeout: timeout}, @@ -258,6 +263,9 @@ func (r *ApplicationDisruptionBudgetResolver) CallPrepareHook(ctx context.Contex if err == nil { return nil } + if e, ok := err.(*openapiruntime.APIError); ok { + statusCode = e.Code + } if e, ok := err.(*disruption.PrepareApplicationStatus425); ok { return fmt.Errorf("retry later, in %v seconds", e.RetryAfter) } @@ -270,6 +278,10 @@ func (r *ApplicationDisruptionBudgetResolver) CallReadyHook(ctx context.Context, return err } + statusCode := 200 + defer func() { + DisruptionBudgetCheckReadyHookStatusCodeTotal.WithLabelValues(r.ApplicationDisruptionBudget.Namespace, r.ApplicationDisruptionBudget.Name, r.ApplicationDisruptionBudget.Kind, strconv.Itoa(statusCode)).Inc() + }() _, err = svc.CheckReadiness(&disruption.CheckReadinessParams{ Body: r.hookBody(nd), HTTPClient: &http.Client{Timeout: timeout}, @@ -277,6 +289,9 @@ func (r *ApplicationDisruptionBudgetResolver) CallReadyHook(ctx context.Context, if err == nil { return nil } + if e, ok := err.(*openapiruntime.APIError); ok { + statusCode = e.Code + } if e, ok := err.(*disruption.CheckReadinessStatus425); ok { return fmt.Errorf("retry later, in %v seconds", e.RetryAfter) } @@ -289,10 +304,17 @@ func (r *ApplicationDisruptionBudgetResolver) CallTerminateHook(ctx context.Cont return err } + statusCode := 200 + defer func() { + DisruptionBudgetCheckTerminateHookStatusCodeTotal.WithLabelValues(r.ApplicationDisruptionBudget.Namespace, r.ApplicationDisruptionBudget.Name, r.ApplicationDisruptionBudget.Kind, strconv.Itoa(statusCode)).Inc() + }() _, err = svc.TerminateDisruption(&disruption.TerminateDisruptionParams{ Body: r.hookBody(nd), HTTPClient: &http.Client{Timeout: timeout}, }) + if e, ok := err.(*openapiruntime.APIError); ok { + statusCode = e.Code + } return err } diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go index 5227de2..071043b 100644 --- a/internal/controller/metrics.go +++ b/internal/controller/metrics.go @@ -76,6 +76,28 @@ var ( }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, ) + // V2 Hooks + DisruptionBudgetCheckPrepareHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_prepare_hook_status_code_total", + Help: "Total number of request by HTTP status code", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, + ) + DisruptionBudgetCheckReadyHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_ready_hook_status_code_total", + Help: "Total number of request by HTTP status code", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, + ) + DisruptionBudgetCheckTerminateHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_terminate_hook_status_code_total", + Help: "Total number of request by HTTP status code", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, + ) DisruptionBudgetCheckHealthHookErrorTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "disruption_budget_health_hook_error_total", @@ -88,14 +110,14 @@ var ( Name: METIC_PREFIX + "disruption_budget_rejected_total", Help: "Total number of rejected node disruption by the disruption budget", }, - []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "type"}, ) DisruptionBudgetGrantedTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "disruption_budget_granted_total", Help: "Total number of granted node disruption by the disruption budget", }, - []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "type"}, ) DisruptionBudgetMaxDisruptions = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ diff --git a/internal/controller/nodedisruption_controller.go b/internal/controller/nodedisruption_controller.go index eb807d3..9b291c5 100644 --- a/internal/controller/nodedisruption_controller.go +++ b/internal/controller/nodedisruption_controller.go @@ -508,7 +508,7 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con Ok: false, } statuses = append(statuses, status) - DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() + DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() break } impactedBudgets = append(impactedBudgets, budget) @@ -551,6 +551,7 @@ func (ndr *SingleNodeDisruptionReconciler) v2HookCheck(ctx context.Context, budg if !currentStatus.Ok && !currentStatus.Preparing { if err := budget.CallPrepareHook(ctx, ndr.NodeDisruption, ndr.Config.HealthHookTimeout); err != nil { logger.Error(err, "failed to call prepare hook") + DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() return nodedisruptionv1alpha1.DisruptedBudgetStatus{ Reference: ref, Reason: fmt.Sprintf("cannot prepare disruption: %s", err), @@ -574,6 +575,8 @@ func (ndr *SingleNodeDisruptionReconciler) v2HookCheck(ctx context.Context, budg } } } + + DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() return nodedisruptionv1alpha1.DisruptedBudgetStatus{ Reference: ref, Ok: true, @@ -590,10 +593,10 @@ func (ndr *SingleNodeDisruptionReconciler) legacyHookCheck(ctx context.Context, Reason: fmt.Sprintf("Unhealthy: %s", err), Ok: false, } - DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() + DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() return status } - DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() + DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() return nodedisruptionv1alpha1.DisruptedBudgetStatus{ Reference: budget.GetNamespacedName(), Reason: "",