Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions internal/controller/applicationdisruptionbudget_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/criteo/node-disruption-controller/internal/appmgrcli/disruption"
"github.com/criteo/node-disruption-controller/pkg/resolver"

openapiruntime "github.com/go-openapi/runtime"
httptransport "github.com/go-openapi/runtime/client"
"github.com/go-openapi/strfmt"
"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -251,13 +252,20 @@ func (r *ApplicationDisruptionBudgetResolver) CallPrepareHook(ctx context.Contex
return err
}

statusCode := 200
defer func() {
DisruptionBudgetCheckPrepareHookStatusCodeTotal.WithLabelValues(r.ApplicationDisruptionBudget.Namespace, r.ApplicationDisruptionBudget.Name, r.ApplicationDisruptionBudget.Kind, strconv.Itoa(statusCode)).Inc()
}()
_, err = svc.PrepareApplication(&disruption.PrepareApplicationParams{
Body: r.hookBody(nd),
HTTPClient: &http.Client{Timeout: timeout},
})
if err == nil {
return nil
}
if e, ok := err.(*openapiruntime.APIError); ok {
statusCode = e.Code
}
if e, ok := err.(*disruption.PrepareApplicationStatus425); ok {
return fmt.Errorf("retry later, in %v seconds", e.RetryAfter)
}
Expand All @@ -270,13 +278,20 @@ func (r *ApplicationDisruptionBudgetResolver) CallReadyHook(ctx context.Context,
return err
}

statusCode := 200
defer func() {
DisruptionBudgetCheckReadyHookStatusCodeTotal.WithLabelValues(r.ApplicationDisruptionBudget.Namespace, r.ApplicationDisruptionBudget.Name, r.ApplicationDisruptionBudget.Kind, strconv.Itoa(statusCode)).Inc()
}()
_, err = svc.CheckReadiness(&disruption.CheckReadinessParams{
Body: r.hookBody(nd),
HTTPClient: &http.Client{Timeout: timeout},
})
if err == nil {
return nil
}
if e, ok := err.(*openapiruntime.APIError); ok {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we check that all svc.{any_method} do return only APIError to avoid logging "200" in metrics while it should be an error code ?

statusCode = e.Code
}
if e, ok := err.(*disruption.CheckReadinessStatus425); ok {
return fmt.Errorf("retry later, in %v seconds", e.RetryAfter)
}
Expand All @@ -289,10 +304,17 @@ func (r *ApplicationDisruptionBudgetResolver) CallTerminateHook(ctx context.Cont
return err
}

statusCode := 200
defer func() {
DisruptionBudgetCheckTerminateHookStatusCodeTotal.WithLabelValues(r.ApplicationDisruptionBudget.Namespace, r.ApplicationDisruptionBudget.Name, r.ApplicationDisruptionBudget.Kind, strconv.Itoa(statusCode)).Inc()
}()
_, err = svc.TerminateDisruption(&disruption.TerminateDisruptionParams{
Body: r.hookBody(nd),
HTTPClient: &http.Client{Timeout: timeout},
})
if e, ok := err.(*openapiruntime.APIError); ok {
statusCode = e.Code
}
return err
}

Expand Down
26 changes: 24 additions & 2 deletions internal/controller/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,28 @@ var (
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"},
)
// V2 Hooks
DisruptionBudgetCheckPrepareHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_prepare_hook_status_code_total",
Help: "Total number of request by HTTP status code",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"},
)
DisruptionBudgetCheckReadyHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_ready_hook_status_code_total",
Help: "Total number of request by HTTP status code",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"},
)
DisruptionBudgetCheckTerminateHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_terminate_hook_status_code_total",
Help: "Total number of request by HTTP status code",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"},
)
DisruptionBudgetCheckHealthHookErrorTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_health_hook_error_total",
Expand All @@ -88,14 +110,14 @@ var (
Name: METIC_PREFIX + "disruption_budget_rejected_total",
Help: "Total number of rejected node disruption by the disruption budget",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "type"},
)
DisruptionBudgetGrantedTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_granted_total",
Help: "Total number of granted node disruption by the disruption budget",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "type"},
)
DisruptionBudgetMaxDisruptions = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Expand Down
9 changes: 6 additions & 3 deletions internal/controller/nodedisruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con
Ok: false,
}
statuses = append(statuses, status)
DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc()
DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc()
break
}
impactedBudgets = append(impactedBudgets, budget)
Expand Down Expand Up @@ -551,6 +551,7 @@ func (ndr *SingleNodeDisruptionReconciler) v2HookCheck(ctx context.Context, budg
if !currentStatus.Ok && !currentStatus.Preparing {
if err := budget.CallPrepareHook(ctx, ndr.NodeDisruption, ndr.Config.HealthHookTimeout); err != nil {
logger.Error(err, "failed to call prepare hook")
DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc()
return nodedisruptionv1alpha1.DisruptedBudgetStatus{
Reference: ref,
Reason: fmt.Sprintf("cannot prepare disruption: %s", err),
Expand All @@ -574,6 +575,8 @@ func (ndr *SingleNodeDisruptionReconciler) v2HookCheck(ctx context.Context, budg
}
}
}

DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc()
return nodedisruptionv1alpha1.DisruptedBudgetStatus{
Reference: ref,
Ok: true,
Expand All @@ -590,10 +593,10 @@ func (ndr *SingleNodeDisruptionReconciler) legacyHookCheck(ctx context.Context,
Reason: fmt.Sprintf("Unhealthy: %s", err),
Ok: false,
}
DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc()
DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc()
return status
}
DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc()
DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, ndr.NodeDisruption.Spec.Type).Inc()
return nodedisruptionv1alpha1.DisruptedBudgetStatus{
Reference: budget.GetNamespacedName(),
Reason: "",
Expand Down
Loading