feat: add hard-limited presets metric (#18008)

Closes https://github.com/coder/coder/issues/17988

Define `preset_hard_limited` metric which for every preset indicates
whether a given preset has reached the hard failure limit (1 for
hard-limited, 0 otherwise).

CLI example:
```
curl -X GET localhost:2118/metrics | grep preset_hard_limited
# HELP coderd_prebuilt_workspaces_preset_hard_limited Indicates whether a given preset has reached the hard failure limit (1 for hard-limited, 0 otherwise).
# TYPE coderd_prebuilt_workspaces_preset_hard_limited gauge
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="GoLand: Large",template_name="Test7"} 1
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="GoLand: Large",template_name="ValidTemplate"} 0
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="IU: Medium",template_name="Test7"} 1
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="IU: Medium",template_name="ValidTemplate"} 0
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="WS: Small",template_name="Test7"} 1
```

NOTE:
```go
if !ps.Preset.Deleted && ps.Preset.UsingActiveVersion {
	c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
}
```

Only active template version is tracked. If admin creates new template
version - old value of metric (for previous template version) will be
overwritten with new value of metric (for active template version).
Because `template_version` is not part of metric:
```go
labels = []string{"template_name", "preset_name", "organization_name"}
```

Implementation is similar to implementation of
`MetricResourceReplacementsCount` metric

---------

Co-authored-by: Susana Ferreira <ssncferreira@gmail.com>
This commit is contained in:
Yevhenii Shcherbina
2025-05-26 11:39:44 -04:00
committed by GitHub
parent 0731304905
commit 2a15aa8a6f
3 changed files with 334 additions and 11 deletions

View File

@ -27,6 +27,7 @@ const (
MetricDesiredGauge = namespace + "desired"
MetricRunningGauge = namespace + "running"
MetricEligibleGauge = namespace + "eligible"
MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
MetricLastUpdatedGauge = namespace + "metrics_last_updated"
)
@ -82,6 +83,12 @@ var (
labels,
nil,
)
presetHardLimitedDesc = prometheus.NewDesc(
MetricPresetHardLimitedGauge,
"Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.",
labels,
nil,
)
lastUpdateDesc = prometheus.NewDesc(
MetricLastUpdatedGauge,
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
@ -104,17 +111,22 @@ type MetricsCollector struct {
replacementsCounter map[replacementKey]float64
replacementsCounterMu sync.Mutex
isPresetHardLimited map[hardLimitedPresetKey]bool
isPresetHardLimitedMu sync.Mutex
}
var _ prometheus.Collector = new(MetricsCollector)
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
log := logger.Named("prebuilds_metrics_collector")
return &MetricsCollector{
database: db,
logger: log,
snapshotter: snapshotter,
replacementsCounter: make(map[replacementKey]float64),
isPresetHardLimited: make(map[hardLimitedPresetKey]bool),
}
}
@ -126,6 +138,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
descCh <- desiredPrebuildsDesc
descCh <- runningPrebuildsDesc
descCh <- eligiblePrebuildsDesc
descCh <- presetHardLimitedDesc
descCh <- lastUpdateDesc
}
@ -173,6 +186,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
}
mc.isPresetHardLimitedMu.Lock()
for key, isHardLimited := range mc.isPresetHardLimited {
var val float64
if isHardLimited {
val = 1
}
metricsCh <- prometheus.MustNewConstMetric(presetHardLimitedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName)
}
mc.isPresetHardLimitedMu.Unlock()
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
}
@ -247,3 +271,25 @@ func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, pres
// cause an issue (or indeed if either would), so we just track the replacement.
mc.replacementsCounter[key]++
}
type hardLimitedPresetKey struct {
orgName, templateName, presetName string
}
func (k hardLimitedPresetKey) String() string {
return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName)
}
// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus.
func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
mc.isPresetHardLimitedMu.Lock()
defer mc.isPresetHardLimitedMu.Unlock()
key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName}
if isHardLimited {
mc.isPresetHardLimited[key] = true
} else {
delete(mc.isPresetHardLimited, key)
}
}

View File

@ -361,17 +361,22 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
slog.F("preset_name", ps.Preset.Name),
)
// If the preset was previously hard-limited, log it and exit early.
if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited {
logger.Warn(ctx, "skipping hard limited preset")
return nil
}
// Report a preset as hard-limited only if all the following conditions are met:
// - The preset is marked as hard-limited
// - The preset is using the active version of its template, and the template has not been deleted
//
// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
// as hard-limited to the admin.
reportAsHardLimited := ps.IsHardLimited && ps.Preset.UsingActiveVersion && !ps.Preset.Deleted
c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, reportAsHardLimited)
// If the preset reached the hard failure limit for the first time during this iteration:
// - Mark it as hard-limited in the database
// - Send notifications to template admins
if ps.IsHardLimited {
logger.Warn(ctx, "skipping hard limited preset")
// - Continue execution, we disallow only creation operation for hard-limited presets. Deletion is allowed.
if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited {
logger.Warn(ctx, "preset is hard limited, notifying template admins")
err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
Status: database.PrebuildStatusHardLimited,
@ -384,10 +389,7 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
err = c.notifyPrebuildFailureLimitReached(ctx, ps)
if err != nil {
logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err))
return nil
}
return nil
}
state := ps.CalculateState()
@ -452,6 +454,13 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
actions.Create = desired
}
// If preset is hard-limited, and it's a create operation, log it and exit early.
// Creation operation is disallowed for hard-limited preset.
if ps.IsHardLimited && actions.Create > 0 {
logger.Warn(ctx, "skipping hard limited preset for create operation")
return nil
}
var multiErr multierror.Error
for range actions.Create {

View File

@ -696,7 +696,8 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
fakeEnqueuer := newFakeEnqueuer()
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), fakeEnqueuer)
registry := prometheus.NewRegistry()
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer)
// Template admin to receive a notification.
templateAdmin := dbgen.User(t, db, database.User{
@ -732,6 +733,17 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
workspaceCount := len(workspaces)
require.Equal(t, 1, workspaceCount)
// Verify initial state: metric is not set - meaning preset is not hard limited.
require.NoError(t, controller.ForceMetricsUpdate(ctx))
mf, err := registry.Gather()
require.NoError(t, err)
metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
})
require.Nil(t, metric)
// We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called.
// Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond.
clock.Advance(time.Nanosecond).MustWait(ctx)
@ -755,6 +767,16 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
// When hard limit is not reached, a new workspace should be created.
require.Equal(t, 2, len(workspaces))
require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus)
// When hard limit is not reached, metric is not set.
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
})
require.Nil(t, metric)
return
}
@ -775,6 +797,252 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
return true
})
require.Len(t, matching, 1)
// When hard limit is reached, metric is set to 1.
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
})
require.NotNil(t, metric)
require.NotNil(t, metric.GetGauge())
require.EqualValues(t, 1, metric.GetGauge().GetValue())
})
}
}
func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
t.Parallel()
if !dbtestutil.WillUsePostgres() {
t.Skip("This test requires postgres")
}
// Test cases verify the behavior of prebuild creation depending on configured failure limits.
testCases := []struct {
name string
hardLimit int64
createNewTemplateVersion bool
deleteTemplate bool
}{
{
// hard limit is hit - but we allow deletion of prebuilt workspace because it's outdated (new template version was created)
name: "new template version is created",
hardLimit: 1,
createNewTemplateVersion: true,
deleteTemplate: false,
},
{
// hard limit is hit - but we allow deletion of prebuilt workspace because template is deleted
name: "template is deleted",
hardLimit: 1,
createNewTemplateVersion: false,
deleteTemplate: true,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
clock := quartz.NewMock(t)
ctx := testutil.Context(t, testutil.WaitShort)
cfg := codersdk.PrebuildsConfig{
FailureHardLimit: serpent.Int64(tc.hardLimit),
ReconciliationBackoffInterval: 0,
}
logger := slogtest.Make(
t, &slogtest.Options{IgnoreErrors: true},
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
fakeEnqueuer := newFakeEnqueuer()
registry := prometheus.NewRegistry()
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer)
// Template admin to receive a notification.
templateAdmin := dbgen.User(t, db, database.User{
RBACRoles: []string{codersdk.RoleTemplateAdmin},
})
// Set up test environment with a template, version, and preset.
ownerID := uuid.New()
dbgen.User(t, db, database.User{
ID: ownerID,
})
org, template := setupTestDBTemplate(t, db, ownerID, false)
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
preset := setupTestDBPreset(t, db, templateVersionID, 2, uuid.New().String())
// Create a successful prebuilt workspace.
successfulWorkspace, _ := setupTestDBPrebuild(
t,
clock,
db,
pubSub,
database.WorkspaceTransitionStart,
database.ProvisionerJobStatusSucceeded,
org.ID,
preset,
template.ID,
templateVersionID,
)
// Make sure that prebuilt workspaces created in such order: [successful, failed].
clock.Advance(time.Second).MustWait(ctx)
// Create a failed prebuilt workspace that counts toward the hard failure limit.
setupTestDBPrebuild(
t,
clock,
db,
pubSub,
database.WorkspaceTransitionStart,
database.ProvisionerJobStatusFailed,
org.ID,
preset,
template.ID,
templateVersionID,
)
getJobStatusMap := func(workspaces []database.WorkspaceTable) map[database.ProvisionerJobStatus]int {
jobStatusMap := make(map[database.ProvisionerJobStatus]int)
for _, workspace := range workspaces {
workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{
WorkspaceID: workspace.ID,
})
require.NoError(t, err)
for _, workspaceBuild := range workspaceBuilds {
job, err := db.GetProvisionerJobByID(ctx, workspaceBuild.JobID)
require.NoError(t, err)
jobStatusMap[job.JobStatus]++
}
}
return jobStatusMap
}
// Verify initial state: two workspaces exist, one successful, one failed.
workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
require.NoError(t, err)
require.Equal(t, 2, len(workspaces))
jobStatusMap := getJobStatusMap(workspaces)
require.Len(t, jobStatusMap, 2)
require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
// Verify initial state: metric is not set - meaning preset is not hard limited.
require.NoError(t, controller.ForceMetricsUpdate(ctx))
mf, err := registry.Gather()
require.NoError(t, err)
metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
})
require.Nil(t, metric)
// We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called.
// Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond.
clock.Advance(time.Nanosecond).MustWait(ctx)
// Trigger reconciliation to attempt creating a new prebuild.
// The outcome depends on whether the hard limit has been reached.
require.NoError(t, controller.ReconcileAll(ctx))
// These two additional calls to ReconcileAll should not trigger any notifications.
// A notification is only sent once.
require.NoError(t, controller.ReconcileAll(ctx))
require.NoError(t, controller.ReconcileAll(ctx))
// Verify the final state after reconciliation.
// When hard limit is reached, no new workspace should be created.
workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
require.NoError(t, err)
require.Equal(t, 2, len(workspaces))
jobStatusMap = getJobStatusMap(workspaces)
require.Len(t, jobStatusMap, 2)
require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
updatedPreset, err := db.GetPresetByID(ctx, preset.ID)
require.NoError(t, err)
require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus)
// When hard limit is reached, a notification should be sent.
matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool {
if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") {
return false
}
if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") {
return false
}
return true
})
require.Len(t, matching, 1)
// When hard limit is reached, metric is set to 1.
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
})
require.NotNil(t, metric)
require.NotNil(t, metric.GetGauge())
require.EqualValues(t, 1, metric.GetGauge().GetValue())
if tc.createNewTemplateVersion {
// Create a new template version and mark it as active
// This marks the template version that we care about as inactive
setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
}
if tc.deleteTemplate {
require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{
ID: template.ID,
Deleted: true,
UpdatedAt: dbtime.Now(),
}))
}
// Trigger reconciliation to make sure that successful, but outdated prebuilt workspace will be deleted.
require.NoError(t, controller.ReconcileAll(ctx))
workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
require.NoError(t, err)
require.Equal(t, 2, len(workspaces))
jobStatusMap = getJobStatusMap(workspaces)
require.Len(t, jobStatusMap, 3)
require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
// Pending job should be the job that deletes successful, but outdated prebuilt workspace.
// Prebuilt workspace MUST be deleted, despite the fact that preset is marked as hard limited.
require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusPending])
workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{
WorkspaceID: successfulWorkspace.ID,
})
require.NoError(t, err)
require.Equal(t, 2, len(workspaceBuilds))
// Make sure that successfully created, but outdated prebuilt workspace was scheduled for deletion.
require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition)
require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition)
// Metric is deleted after preset became outdated.
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
})
require.Nil(t, metric)
})
}
}