diff --git a/internal/controller/applicationdisruptionbudget_controller.go b/internal/controller/applicationdisruptionbudget_controller.go index b54976a..58b6f3b 100644 --- a/internal/controller/applicationdisruptionbudget_controller.go +++ b/internal/controller/applicationdisruptionbudget_controller.go @@ -33,6 +33,7 @@ import ( "github.com/criteo/node-disruption-controller/internal/appmgrcli/disruption" "github.com/criteo/node-disruption-controller/pkg/resolver" + openapiruntime "github.com/go-openapi/runtime" httptransport "github.com/go-openapi/runtime/client" "github.com/go-openapi/strfmt" "github.com/prometheus/client_golang/prometheus" @@ -251,6 +252,10 @@ func (r *ApplicationDisruptionBudgetResolver) CallPrepareHook(ctx context.Contex return err } + statusCode := 200 + defer func() { + DisruptionBudgetCheckPrepareHookStatusCodeTotal.WithLabelValues(r.ApplicationDisruptionBudget.Namespace, r.ApplicationDisruptionBudget.Name, r.ApplicationDisruptionBudget.Kind, strconv.Itoa(statusCode)).Inc() + }() _, err = svc.PrepareApplication(&disruption.PrepareApplicationParams{ Body: r.hookBody(nd), HTTPClient: &http.Client{Timeout: timeout}, @@ -258,6 +263,9 @@ func (r *ApplicationDisruptionBudgetResolver) CallPrepareHook(ctx context.Contex if err == nil { return nil } + if e, ok := err.(*openapiruntime.APIError); ok { + statusCode = e.Code + } if e, ok := err.(*disruption.PrepareApplicationStatus425); ok { return fmt.Errorf("retry later, in %v seconds", e.RetryAfter) } @@ -270,6 +278,10 @@ func (r *ApplicationDisruptionBudgetResolver) CallReadyHook(ctx context.Context, return err } + statusCode := 200 + defer func() { + DisruptionBudgetCheckReadyHookStatusCodeTotal.WithLabelValues(r.ApplicationDisruptionBudget.Namespace, r.ApplicationDisruptionBudget.Name, r.ApplicationDisruptionBudget.Kind, strconv.Itoa(statusCode)).Inc() + }() _, err = svc.CheckReadiness(&disruption.CheckReadinessParams{ Body: r.hookBody(nd), HTTPClient: &http.Client{Timeout: timeout}, @@ -277,6 +289,9 @@ func (r *ApplicationDisruptionBudgetResolver) CallReadyHook(ctx context.Context, if err == nil { return nil } + if e, ok := err.(*openapiruntime.APIError); ok { + statusCode = e.Code + } if e, ok := err.(*disruption.CheckReadinessStatus425); ok { return fmt.Errorf("retry later, in %v seconds", e.RetryAfter) } @@ -289,10 +304,17 @@ func (r *ApplicationDisruptionBudgetResolver) CallTerminateHook(ctx context.Cont return err } + statusCode := 200 + defer func() { + DisruptionBudgetCheckTerminateHookStatusCodeTotal.WithLabelValues(r.ApplicationDisruptionBudget.Namespace, r.ApplicationDisruptionBudget.Name, r.ApplicationDisruptionBudget.Kind, strconv.Itoa(statusCode)).Inc() + }() _, err = svc.TerminateDisruption(&disruption.TerminateDisruptionParams{ Body: r.hookBody(nd), HTTPClient: &http.Client{Timeout: timeout}, }) + if e, ok := err.(*openapiruntime.APIError); ok { + statusCode = e.Code + } return err } diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go index 5227de2..071043b 100644 --- a/internal/controller/metrics.go +++ b/internal/controller/metrics.go @@ -76,6 +76,28 @@ var ( }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, ) + // V2 Hooks + DisruptionBudgetCheckPrepareHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_prepare_hook_status_code_total", + Help: "Total number of request by HTTP status code", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, + ) + DisruptionBudgetCheckReadyHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_ready_hook_status_code_total", + Help: "Total number of request by HTTP status code", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, + ) + DisruptionBudgetCheckTerminateHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_terminate_hook_status_code_total", + Help: "Total number of request by HTTP status code", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, + ) DisruptionBudgetCheckHealthHookErrorTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "disruption_budget_health_hook_error_total", @@ -88,14 +110,14 @@ var ( Name: METIC_PREFIX + "disruption_budget_rejected_total", Help: "Total number of rejected node disruption by the disruption budget", }, - []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "type"}, ) DisruptionBudgetGrantedTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "disruption_budget_granted_total", Help: "Total number of granted node disruption by the disruption budget", }, - []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "type"}, ) DisruptionBudgetMaxDisruptions = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ diff --git a/internal/controller/nodedisruption_controller.go b/internal/controller/nodedisruption_controller.go index eb807d3..9b291c5 100644 --- a/internal/controller/nodedisruption_controller.go +++ b/internal/controller/nodedisruption_controller.go @@ -508,7 +508,7 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con Ok: false, } statuses = append(statuses, status) - DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() + DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() break } impactedBudgets = append(impactedBudgets, budget) @@ -551,6 +551,7 @@ func (ndr *SingleNodeDisruptionReconciler) v2HookCheck(ctx context.Context, budg if !currentStatus.Ok && !currentStatus.Preparing { if err := budget.CallPrepareHook(ctx, ndr.NodeDisruption, ndr.Config.HealthHookTimeout); err != nil { logger.Error(err, "failed to call prepare hook") + DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() return nodedisruptionv1alpha1.DisruptedBudgetStatus{ Reference: ref, Reason: fmt.Sprintf("cannot prepare disruption: %s", err), @@ -574,6 +575,8 @@ func (ndr *SingleNodeDisruptionReconciler) v2HookCheck(ctx context.Context, budg } } } + + DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() return nodedisruptionv1alpha1.DisruptedBudgetStatus{ Reference: ref, Ok: true, @@ -590,10 +593,10 @@ func (ndr *SingleNodeDisruptionReconciler) legacyHookCheck(ctx context.Context, Reason: fmt.Sprintf("Unhealthy: %s", err), Ok: false, } - DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() + DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() return status } - DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() + DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc() return nodedisruptionv1alpha1.DisruptedBudgetStatus{ Reference: budget.GetNamespacedName(), Reason: "",