fix: reduce cost of prebuild failure (#17697)

Relates to https://github.com/coder/coder/issues/17432

### Part 1:

Notes:
- `GetPresetsAtFailureLimit` SQL query is added, which is similar to
`GetPresetsBackoff`, they use same CTEs: `filtered_builds`,
`time_sorted_builds`, but they are still different.

- Query is executed on every loop iteration. We can consider marking
specific preset as permanently failed as an optimization to avoid
executing query on every loop iteration. But I decided don't do it for
now.

- By default `FailureHardLimit` is set to 3.

- `FailureHardLimit` is configurable. Setting it to zero - means that
hard limit is disabled.

### Part 2

Notes:
- `PrebuildFailureLimitReached` notification is added.
- Notification is sent to template admins.
- Notification is sent only the first time, when hard limit is reached.
But it will `log.Warn` on every loop iteration.
- I introduced this enum:
```sql
CREATE TYPE prebuild_status AS ENUM (
  'normal',           -- Prebuilds are working as expected; this is the default, healthy state.
  'hard_limited',     -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore.
  'validation_failed' -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried.
);
```
`validation_failed` not used in this PR, but I think it will be used in
next one, so I wanted to save us an extra migration.

- Notification looks like this:
<img width="472" alt="image"
src="https://github.com/user-attachments/assets/e10efea0-1790-4e7f-a65c-f94c40fced27"
/>

### Latest notification views:
<img width="463" alt="image"
src="https://github.com/user-attachments/assets/11310c58-68d1-4075-a497-f76d854633fe"
/>
<img width="725" alt="image"
src="https://github.com/user-attachments/assets/6bbfe21a-91ac-47c3-a9d1-21807bb0c53a"
/>
This commit is contained in:
Yevhenii Shcherbina
2025-05-21 15:16:38 -04:00
committed by GitHub
parent e1934fe119
commit 53e8e9c7cd
32 changed files with 1160 additions and 60 deletions

View File

@ -313,6 +313,7 @@ func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Stor
if len(presetsWithPrebuilds) == 0 {
return nil
}
allRunningPrebuilds, err := db.GetRunningPrebuiltWorkspaces(ctx)
if err != nil {
return xerrors.Errorf("failed to get running prebuilds: %w", err)
@ -328,7 +329,18 @@ func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Stor
return xerrors.Errorf("failed to get backoffs for presets: %w", err)
}
state = prebuilds.NewGlobalSnapshot(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress, presetsBackoff)
hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, c.cfg.FailureHardLimit.Value())
if err != nil {
return xerrors.Errorf("failed to get hard limited presets: %w", err)
}
state = prebuilds.NewGlobalSnapshot(
presetsWithPrebuilds,
allRunningPrebuilds,
allPrebuildsInProgress,
presetsBackoff,
hardLimitedPresets,
)
return nil
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs
@ -349,19 +361,45 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
slog.F("preset_name", ps.Preset.Name),
)
// If the preset was previously hard-limited, log it and exit early.
if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited {
logger.Warn(ctx, "skipping hard limited preset")
return nil
}
// If the preset reached the hard failure limit for the first time during this iteration:
// - Mark it as hard-limited in the database
// - Send notifications to template admins
if ps.IsHardLimited {
logger.Warn(ctx, "skipping hard limited preset")
err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
Status: database.PrebuildStatusHardLimited,
PresetID: ps.Preset.ID,
})
if err != nil {
return xerrors.Errorf("failed to update preset prebuild status: %w", err)
}
err = c.notifyPrebuildFailureLimitReached(ctx, ps)
if err != nil {
logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err))
return nil
}
return nil
}
state := ps.CalculateState()
actions, err := c.CalculateActions(ctx, ps)
if err != nil {
logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err), slog.F("preset_id", ps.Preset.ID))
logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err))
return nil
}
// Nothing has to be done.
if !ps.Preset.UsingActiveVersion && actions.IsNoop() {
logger.Debug(ctx, "skipping reconciliation for preset - nothing has to be done",
slog.F("template_id", ps.Preset.TemplateID.String()), slog.F("template_name", ps.Preset.TemplateName),
slog.F("template_version_id", ps.Preset.TemplateVersionID.String()), slog.F("template_version_name", ps.Preset.TemplateVersionName),
slog.F("preset_id", ps.Preset.ID.String()), slog.F("preset_name", ps.Preset.Name))
logger.Debug(ctx, "skipping reconciliation for preset - nothing has to be done")
return nil
}
@ -442,6 +480,49 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
}
}
func (c *StoreReconciler) notifyPrebuildFailureLimitReached(ctx context.Context, ps prebuilds.PresetSnapshot) error {
// nolint:gocritic // Necessary to query all the required data.
ctx = dbauthz.AsSystemRestricted(ctx)
// Send notification to template admins.
if c.notifEnq == nil {
c.logger.Warn(ctx, "notification enqueuer not set, cannot send prebuild is hard limited notification(s)")
return nil
}
templateAdmins, err := c.store.GetUsers(ctx, database.GetUsersParams{
RbacRole: []string{codersdk.RoleTemplateAdmin},
})
if err != nil {
return xerrors.Errorf("fetch template admins: %w", err)
}
for _, templateAdmin := range templateAdmins {
if _, err := c.notifEnq.EnqueueWithData(ctx, templateAdmin.ID, notifications.PrebuildFailureLimitReached,
map[string]string{
"org": ps.Preset.OrganizationName,
"template": ps.Preset.TemplateName,
"template_version": ps.Preset.TemplateVersionName,
"preset": ps.Preset.Name,
},
map[string]any{},
"prebuilds_reconciler",
// Associate this notification with all the related entities.
ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID, ps.Preset.OrganizationID,
); err != nil {
c.logger.Error(ctx,
"failed to send notification",
slog.Error(err),
slog.F("template_admin_id", templateAdmin.ID.String()),
)
continue
}
}
return nil
}
func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) (*prebuilds.ReconciliationActions, error) {
if ctx.Err() != nil {
return nil, ctx.Err()

View File

@ -654,6 +654,131 @@ func TestDeletionOfPrebuiltWorkspaceWithInvalidPreset(t *testing.T) {
require.Equal(t, database.WorkspaceTransitionDelete, builds[0].Transition)
}
func TestSkippingHardLimitedPresets(t *testing.T) {
t.Parallel()
if !dbtestutil.WillUsePostgres() {
t.Skip("This test requires postgres")
}
// Test cases verify the behavior of prebuild creation depending on configured failure limits.
testCases := []struct {
name string
hardLimit int64
isHardLimitHit bool
}{
{
name: "hard limit is hit - skip creation of prebuilt workspace",
hardLimit: 1,
isHardLimitHit: true,
},
{
name: "hard limit is not hit - try to create prebuilt workspace again",
hardLimit: 2,
isHardLimitHit: false,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
templateDeleted := false
clock := quartz.NewMock(t)
ctx := testutil.Context(t, testutil.WaitShort)
cfg := codersdk.PrebuildsConfig{
FailureHardLimit: serpent.Int64(tc.hardLimit),
ReconciliationBackoffInterval: 0,
}
logger := slogtest.Make(
t, &slogtest.Options{IgnoreErrors: true},
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
fakeEnqueuer := newFakeEnqueuer()
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), fakeEnqueuer)
// Template admin to receive a notification.
templateAdmin := dbgen.User(t, db, database.User{
RBACRoles: []string{codersdk.RoleTemplateAdmin},
})
// Set up test environment with a template, version, and preset.
ownerID := uuid.New()
dbgen.User(t, db, database.User{
ID: ownerID,
})
org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted)
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String())
// Create a failed prebuild workspace that counts toward the hard failure limit.
setupTestDBPrebuild(
t,
clock,
db,
pubSub,
database.WorkspaceTransitionStart,
database.ProvisionerJobStatusFailed,
org.ID,
preset,
template.ID,
templateVersionID,
)
// Verify initial state: one failed workspace exists.
workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
require.NoError(t, err)
workspaceCount := len(workspaces)
require.Equal(t, 1, workspaceCount)
// We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called.
// Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond.
clock.Advance(time.Nanosecond).MustWait(ctx)
// Trigger reconciliation to attempt creating a new prebuild.
// The outcome depends on whether the hard limit has been reached.
require.NoError(t, controller.ReconcileAll(ctx))
// These two additional calls to ReconcileAll should not trigger any notifications.
// A notification is only sent once.
require.NoError(t, controller.ReconcileAll(ctx))
require.NoError(t, controller.ReconcileAll(ctx))
// Verify the final state after reconciliation.
workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
require.NoError(t, err)
updatedPreset, err := db.GetPresetByID(ctx, preset.ID)
require.NoError(t, err)
if !tc.isHardLimitHit {
// When hard limit is not reached, a new workspace should be created.
require.Equal(t, 2, len(workspaces))
require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus)
return
}
// When hard limit is reached, no new workspace should be created.
require.Equal(t, 1, len(workspaces))
require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus)
// When hard limit is reached, a notification should be sent.
matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool {
if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") {
return false
}
if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") {
return false
}
return true
})
require.Len(t, matching, 1)
})
}
}
func TestRunLoop(t *testing.T) {
t.Parallel()