mirror of
https://github.com/coder/coder.git
synced 2025-07-12 00:14:10 +00:00
fix: reduce cost of prebuild failure (#17697)
Relates to https://github.com/coder/coder/issues/17432 ### Part 1: Notes: - `GetPresetsAtFailureLimit` SQL query is added, which is similar to `GetPresetsBackoff`, they use same CTEs: `filtered_builds`, `time_sorted_builds`, but they are still different. - Query is executed on every loop iteration. We can consider marking specific preset as permanently failed as an optimization to avoid executing query on every loop iteration. But I decided don't do it for now. - By default `FailureHardLimit` is set to 3. - `FailureHardLimit` is configurable. Setting it to zero - means that hard limit is disabled. ### Part 2 Notes: - `PrebuildFailureLimitReached` notification is added. - Notification is sent to template admins. - Notification is sent only the first time, when hard limit is reached. But it will `log.Warn` on every loop iteration. - I introduced this enum: ```sql CREATE TYPE prebuild_status AS ENUM ( 'normal', -- Prebuilds are working as expected; this is the default, healthy state. 'hard_limited', -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore. 'validation_failed' -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried. ); ``` `validation_failed` not used in this PR, but I think it will be used in next one, so I wanted to save us an extra migration. - Notification looks like this: <img width="472" alt="image" src="https://github.com/user-attachments/assets/e10efea0-1790-4e7f-a65c-f94c40fced27" /> ### Latest notification views: <img width="463" alt="image" src="https://github.com/user-attachments/assets/11310c58-68d1-4075-a497-f76d854633fe" /> <img width="725" alt="image" src="https://github.com/user-attachments/assets/6bbfe21a-91ac-47c3-a9d1-21807bb0c53a" />
This commit is contained in:
committed by
GitHub
parent
e1934fe119
commit
53e8e9c7cd
@ -313,6 +313,7 @@ func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Stor
|
||||
if len(presetsWithPrebuilds) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
allRunningPrebuilds, err := db.GetRunningPrebuiltWorkspaces(ctx)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to get running prebuilds: %w", err)
|
||||
@ -328,7 +329,18 @@ func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Stor
|
||||
return xerrors.Errorf("failed to get backoffs for presets: %w", err)
|
||||
}
|
||||
|
||||
state = prebuilds.NewGlobalSnapshot(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress, presetsBackoff)
|
||||
hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, c.cfg.FailureHardLimit.Value())
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to get hard limited presets: %w", err)
|
||||
}
|
||||
|
||||
state = prebuilds.NewGlobalSnapshot(
|
||||
presetsWithPrebuilds,
|
||||
allRunningPrebuilds,
|
||||
allPrebuildsInProgress,
|
||||
presetsBackoff,
|
||||
hardLimitedPresets,
|
||||
)
|
||||
return nil
|
||||
}, &database.TxOptions{
|
||||
Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs
|
||||
@ -349,19 +361,45 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
|
||||
slog.F("preset_name", ps.Preset.Name),
|
||||
)
|
||||
|
||||
// If the preset was previously hard-limited, log it and exit early.
|
||||
if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited {
|
||||
logger.Warn(ctx, "skipping hard limited preset")
|
||||
return nil
|
||||
}
|
||||
|
||||
// If the preset reached the hard failure limit for the first time during this iteration:
|
||||
// - Mark it as hard-limited in the database
|
||||
// - Send notifications to template admins
|
||||
if ps.IsHardLimited {
|
||||
logger.Warn(ctx, "skipping hard limited preset")
|
||||
|
||||
err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
|
||||
Status: database.PrebuildStatusHardLimited,
|
||||
PresetID: ps.Preset.ID,
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to update preset prebuild status: %w", err)
|
||||
}
|
||||
|
||||
err = c.notifyPrebuildFailureLimitReached(ctx, ps)
|
||||
if err != nil {
|
||||
logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
state := ps.CalculateState()
|
||||
actions, err := c.CalculateActions(ctx, ps)
|
||||
if err != nil {
|
||||
logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err), slog.F("preset_id", ps.Preset.ID))
|
||||
logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Nothing has to be done.
|
||||
if !ps.Preset.UsingActiveVersion && actions.IsNoop() {
|
||||
logger.Debug(ctx, "skipping reconciliation for preset - nothing has to be done",
|
||||
slog.F("template_id", ps.Preset.TemplateID.String()), slog.F("template_name", ps.Preset.TemplateName),
|
||||
slog.F("template_version_id", ps.Preset.TemplateVersionID.String()), slog.F("template_version_name", ps.Preset.TemplateVersionName),
|
||||
slog.F("preset_id", ps.Preset.ID.String()), slog.F("preset_name", ps.Preset.Name))
|
||||
logger.Debug(ctx, "skipping reconciliation for preset - nothing has to be done")
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -442,6 +480,49 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
|
||||
}
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) notifyPrebuildFailureLimitReached(ctx context.Context, ps prebuilds.PresetSnapshot) error {
|
||||
// nolint:gocritic // Necessary to query all the required data.
|
||||
ctx = dbauthz.AsSystemRestricted(ctx)
|
||||
|
||||
// Send notification to template admins.
|
||||
if c.notifEnq == nil {
|
||||
c.logger.Warn(ctx, "notification enqueuer not set, cannot send prebuild is hard limited notification(s)")
|
||||
return nil
|
||||
}
|
||||
|
||||
templateAdmins, err := c.store.GetUsers(ctx, database.GetUsersParams{
|
||||
RbacRole: []string{codersdk.RoleTemplateAdmin},
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("fetch template admins: %w", err)
|
||||
}
|
||||
|
||||
for _, templateAdmin := range templateAdmins {
|
||||
if _, err := c.notifEnq.EnqueueWithData(ctx, templateAdmin.ID, notifications.PrebuildFailureLimitReached,
|
||||
map[string]string{
|
||||
"org": ps.Preset.OrganizationName,
|
||||
"template": ps.Preset.TemplateName,
|
||||
"template_version": ps.Preset.TemplateVersionName,
|
||||
"preset": ps.Preset.Name,
|
||||
},
|
||||
map[string]any{},
|
||||
"prebuilds_reconciler",
|
||||
// Associate this notification with all the related entities.
|
||||
ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID, ps.Preset.OrganizationID,
|
||||
); err != nil {
|
||||
c.logger.Error(ctx,
|
||||
"failed to send notification",
|
||||
slog.Error(err),
|
||||
slog.F("template_admin_id", templateAdmin.ID.String()),
|
||||
)
|
||||
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) (*prebuilds.ReconciliationActions, error) {
|
||||
if ctx.Err() != nil {
|
||||
return nil, ctx.Err()
|
||||
|
@ -654,6 +654,131 @@ func TestDeletionOfPrebuiltWorkspaceWithInvalidPreset(t *testing.T) {
|
||||
require.Equal(t, database.WorkspaceTransitionDelete, builds[0].Transition)
|
||||
}
|
||||
|
||||
func TestSkippingHardLimitedPresets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if !dbtestutil.WillUsePostgres() {
|
||||
t.Skip("This test requires postgres")
|
||||
}
|
||||
|
||||
// Test cases verify the behavior of prebuild creation depending on configured failure limits.
|
||||
testCases := []struct {
|
||||
name string
|
||||
hardLimit int64
|
||||
isHardLimitHit bool
|
||||
}{
|
||||
{
|
||||
name: "hard limit is hit - skip creation of prebuilt workspace",
|
||||
hardLimit: 1,
|
||||
isHardLimitHit: true,
|
||||
},
|
||||
{
|
||||
name: "hard limit is not hit - try to create prebuilt workspace again",
|
||||
hardLimit: 2,
|
||||
isHardLimitHit: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
templateDeleted := false
|
||||
|
||||
clock := quartz.NewMock(t)
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
cfg := codersdk.PrebuildsConfig{
|
||||
FailureHardLimit: serpent.Int64(tc.hardLimit),
|
||||
ReconciliationBackoffInterval: 0,
|
||||
}
|
||||
logger := slogtest.Make(
|
||||
t, &slogtest.Options{IgnoreErrors: true},
|
||||
).Leveled(slog.LevelDebug)
|
||||
db, pubSub := dbtestutil.NewDB(t)
|
||||
fakeEnqueuer := newFakeEnqueuer()
|
||||
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), fakeEnqueuer)
|
||||
|
||||
// Template admin to receive a notification.
|
||||
templateAdmin := dbgen.User(t, db, database.User{
|
||||
RBACRoles: []string{codersdk.RoleTemplateAdmin},
|
||||
})
|
||||
|
||||
// Set up test environment with a template, version, and preset.
|
||||
ownerID := uuid.New()
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: ownerID,
|
||||
})
|
||||
org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted)
|
||||
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
|
||||
preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String())
|
||||
|
||||
// Create a failed prebuild workspace that counts toward the hard failure limit.
|
||||
setupTestDBPrebuild(
|
||||
t,
|
||||
clock,
|
||||
db,
|
||||
pubSub,
|
||||
database.WorkspaceTransitionStart,
|
||||
database.ProvisionerJobStatusFailed,
|
||||
org.ID,
|
||||
preset,
|
||||
template.ID,
|
||||
templateVersionID,
|
||||
)
|
||||
|
||||
// Verify initial state: one failed workspace exists.
|
||||
workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
|
||||
require.NoError(t, err)
|
||||
workspaceCount := len(workspaces)
|
||||
require.Equal(t, 1, workspaceCount)
|
||||
|
||||
// We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called.
|
||||
// Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond.
|
||||
clock.Advance(time.Nanosecond).MustWait(ctx)
|
||||
|
||||
// Trigger reconciliation to attempt creating a new prebuild.
|
||||
// The outcome depends on whether the hard limit has been reached.
|
||||
require.NoError(t, controller.ReconcileAll(ctx))
|
||||
|
||||
// These two additional calls to ReconcileAll should not trigger any notifications.
|
||||
// A notification is only sent once.
|
||||
require.NoError(t, controller.ReconcileAll(ctx))
|
||||
require.NoError(t, controller.ReconcileAll(ctx))
|
||||
|
||||
// Verify the final state after reconciliation.
|
||||
workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
|
||||
require.NoError(t, err)
|
||||
updatedPreset, err := db.GetPresetByID(ctx, preset.ID)
|
||||
require.NoError(t, err)
|
||||
|
||||
if !tc.isHardLimitHit {
|
||||
// When hard limit is not reached, a new workspace should be created.
|
||||
require.Equal(t, 2, len(workspaces))
|
||||
require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus)
|
||||
return
|
||||
}
|
||||
|
||||
// When hard limit is reached, no new workspace should be created.
|
||||
require.Equal(t, 1, len(workspaces))
|
||||
require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus)
|
||||
|
||||
// When hard limit is reached, a notification should be sent.
|
||||
matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool {
|
||||
if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") {
|
||||
return false
|
||||
}
|
||||
|
||||
if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
require.Len(t, matching, 1)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunLoop(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
Reference in New Issue
Block a user