mirror of
https://github.com/coder/coder.git
synced 2025-07-13 21:36:50 +00:00
fix: reduce cost of prebuild failure (#17697)
Relates to https://github.com/coder/coder/issues/17432 ### Part 1: Notes: - `GetPresetsAtFailureLimit` SQL query is added, which is similar to `GetPresetsBackoff`; they use the same CTEs (`filtered_builds`, `time_sorted_builds`), but they are still different. - The query is executed on every loop iteration. As an optimization, we could mark a specific preset as permanently failed to avoid executing the query on every loop iteration, but I decided not to do it for now. - By default `FailureHardLimit` is set to 3. - `FailureHardLimit` is configurable. Setting it to zero means that the hard limit is disabled. ### Part 2 Notes: - `PrebuildFailureLimitReached` notification is added. - The notification is sent to template admins. - The notification is sent only the first time the hard limit is reached, but it will `log.Warn` on every loop iteration. - I introduced this enum: ```sql CREATE TYPE prebuild_status AS ENUM ( 'normal', -- Prebuilds are working as expected; this is the default, healthy state. 'hard_limited', -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore. 'validation_failed' -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried. ); ``` `validation_failed` is not used in this PR, but I think it will be used in the next one, so I wanted to save us an extra migration. - Notification looks like this: <img width="472" alt="image" src="https://github.com/user-attachments/assets/e10efea0-1790-4e7f-a65c-f94c40fced27" /> ### Latest notification views: <img width="463" alt="image" src="https://github.com/user-attachments/assets/11310c58-68d1-4075-a497-f76d854633fe" /> <img width="725" alt="image" src="https://github.com/user-attachments/assets/6bbfe21a-91ac-47c3-a9d1-21807bb0c53a" />
This commit is contained in:
committed by
GitHub
parent
e1934fe119
commit
53e8e9c7cd
@ -654,6 +654,131 @@ func TestDeletionOfPrebuiltWorkspaceWithInvalidPreset(t *testing.T) {
|
||||
require.Equal(t, database.WorkspaceTransitionDelete, builds[0].Transition)
|
||||
}
|
||||
|
||||
func TestSkippingHardLimitedPresets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if !dbtestutil.WillUsePostgres() {
|
||||
t.Skip("This test requires postgres")
|
||||
}
|
||||
|
||||
// Test cases verify the behavior of prebuild creation depending on configured failure limits.
|
||||
testCases := []struct {
|
||||
name string
|
||||
hardLimit int64
|
||||
isHardLimitHit bool
|
||||
}{
|
||||
{
|
||||
name: "hard limit is hit - skip creation of prebuilt workspace",
|
||||
hardLimit: 1,
|
||||
isHardLimitHit: true,
|
||||
},
|
||||
{
|
||||
name: "hard limit is not hit - try to create prebuilt workspace again",
|
||||
hardLimit: 2,
|
||||
isHardLimitHit: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
templateDeleted := false
|
||||
|
||||
clock := quartz.NewMock(t)
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
cfg := codersdk.PrebuildsConfig{
|
||||
FailureHardLimit: serpent.Int64(tc.hardLimit),
|
||||
ReconciliationBackoffInterval: 0,
|
||||
}
|
||||
logger := slogtest.Make(
|
||||
t, &slogtest.Options{IgnoreErrors: true},
|
||||
).Leveled(slog.LevelDebug)
|
||||
db, pubSub := dbtestutil.NewDB(t)
|
||||
fakeEnqueuer := newFakeEnqueuer()
|
||||
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), fakeEnqueuer)
|
||||
|
||||
// Template admin to receive a notification.
|
||||
templateAdmin := dbgen.User(t, db, database.User{
|
||||
RBACRoles: []string{codersdk.RoleTemplateAdmin},
|
||||
})
|
||||
|
||||
// Set up test environment with a template, version, and preset.
|
||||
ownerID := uuid.New()
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: ownerID,
|
||||
})
|
||||
org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted)
|
||||
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
|
||||
preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String())
|
||||
|
||||
// Create a failed prebuild workspace that counts toward the hard failure limit.
|
||||
setupTestDBPrebuild(
|
||||
t,
|
||||
clock,
|
||||
db,
|
||||
pubSub,
|
||||
database.WorkspaceTransitionStart,
|
||||
database.ProvisionerJobStatusFailed,
|
||||
org.ID,
|
||||
preset,
|
||||
template.ID,
|
||||
templateVersionID,
|
||||
)
|
||||
|
||||
// Verify initial state: one failed workspace exists.
|
||||
workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
|
||||
require.NoError(t, err)
|
||||
workspaceCount := len(workspaces)
|
||||
require.Equal(t, 1, workspaceCount)
|
||||
|
||||
// We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called.
|
||||
// Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond.
|
||||
clock.Advance(time.Nanosecond).MustWait(ctx)
|
||||
|
||||
// Trigger reconciliation to attempt creating a new prebuild.
|
||||
// The outcome depends on whether the hard limit has been reached.
|
||||
require.NoError(t, controller.ReconcileAll(ctx))
|
||||
|
||||
// These two additional calls to ReconcileAll should not trigger any notifications.
|
||||
// A notification is only sent once.
|
||||
require.NoError(t, controller.ReconcileAll(ctx))
|
||||
require.NoError(t, controller.ReconcileAll(ctx))
|
||||
|
||||
// Verify the final state after reconciliation.
|
||||
workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
|
||||
require.NoError(t, err)
|
||||
updatedPreset, err := db.GetPresetByID(ctx, preset.ID)
|
||||
require.NoError(t, err)
|
||||
|
||||
if !tc.isHardLimitHit {
|
||||
// When hard limit is not reached, a new workspace should be created.
|
||||
require.Equal(t, 2, len(workspaces))
|
||||
require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus)
|
||||
return
|
||||
}
|
||||
|
||||
// When hard limit is reached, no new workspace should be created.
|
||||
require.Equal(t, 1, len(workspaces))
|
||||
require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus)
|
||||
|
||||
// When hard limit is reached, a notification should be sent.
|
||||
matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool {
|
||||
if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") {
|
||||
return false
|
||||
}
|
||||
|
||||
if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
require.Len(t, matching, 1)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunLoop(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
Reference in New Issue
Block a user