mirror of
https://github.com/coder/coder.git
synced 2025-07-03 16:13:58 +00:00
fix: reduce cost of prebuild failure (#17697)
Relates to https://github.com/coder/coder/issues/17432 ### Part 1 Notes: - The `GetPresetsAtFailureLimit` SQL query is added. It is similar to `GetPresetsBackoff` and uses the same CTEs (`filtered_builds`, `time_sorted_builds`), but the two queries are still different. - The query is executed on every loop iteration. As an optimization, we could mark a specific preset as permanently failed to avoid executing the query on every loop iteration, but I decided not to do that for now. - By default, `FailureHardLimit` is set to 3. - `FailureHardLimit` is configurable. Setting it to zero means that the hard limit is disabled. ### Part 2 Notes: - The `PrebuildFailureLimitReached` notification is added. - The notification is sent to template admins. - The notification is sent only the first time the hard limit is reached, but a `log.Warn` is emitted on every loop iteration. - I introduced this enum: ```sql CREATE TYPE prebuild_status AS ENUM ( 'normal', -- Prebuilds are working as expected; this is the default, healthy state. 'hard_limited', -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore. 'validation_failed' -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried. ); ``` `validation_failed` is not used in this PR, but I think it will be used in the next one, so I wanted to save us an extra migration. - The notification looks like this: <img width="472" alt="image" src="https://github.com/user-attachments/assets/e10efea0-1790-4e7f-a65c-f94c40fced27" /> ### Latest notification views: <img width="463" alt="image" src="https://github.com/user-attachments/assets/11310c58-68d1-4075-a497-f76d854633fe" /> <img width="725" alt="image" src="https://github.com/user-attachments/assets/6bbfe21a-91ac-47c3-a9d1-21807bb0c53a" />
This commit is contained in:
committed by
GitHub
parent
e1934fe119
commit
53e8e9c7cd
@ -4123,8 +4123,7 @@ func TestGetPresetsBackoff(t *testing.T) {
|
||||
})
|
||||
|
||||
tmpl1 := createTemplate(t, db, orgID, userID)
|
||||
tmpl1V1 := createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil)
|
||||
_ = tmpl1V1
|
||||
createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil)
|
||||
|
||||
backoffs, err := db.GetPresetsBackoff(ctx, now.Add(-time.Hour))
|
||||
require.NoError(t, err)
|
||||
@ -4401,6 +4400,311 @@ func TestGetPresetsBackoff(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestGetPresetsAtFailureLimit(t *testing.T) {
|
||||
t.Parallel()
|
||||
if !dbtestutil.WillUsePostgres() {
|
||||
t.SkipNow()
|
||||
}
|
||||
|
||||
now := dbtime.Now()
|
||||
hourBefore := now.Add(-time.Hour)
|
||||
orgID := uuid.New()
|
||||
userID := uuid.New()
|
||||
|
||||
findPresetByTmplVersionID := func(hardLimitedPresets []database.GetPresetsAtFailureLimitRow, tmplVersionID uuid.UUID) *database.GetPresetsAtFailureLimitRow {
|
||||
for _, preset := range hardLimitedPresets {
|
||||
if preset.TemplateVersionID == tmplVersionID {
|
||||
return &preset
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
// true - build is successful
|
||||
// false - build is unsuccessful
|
||||
buildSuccesses []bool
|
||||
hardLimit int64
|
||||
expHitHardLimit bool
|
||||
}{
|
||||
{
|
||||
name: "failed build",
|
||||
buildSuccesses: []bool{false},
|
||||
hardLimit: 1,
|
||||
expHitHardLimit: true,
|
||||
},
|
||||
{
|
||||
name: "2 failed builds",
|
||||
buildSuccesses: []bool{false, false},
|
||||
hardLimit: 1,
|
||||
expHitHardLimit: true,
|
||||
},
|
||||
{
|
||||
name: "successful build",
|
||||
buildSuccesses: []bool{true},
|
||||
hardLimit: 1,
|
||||
expHitHardLimit: false,
|
||||
},
|
||||
{
|
||||
name: "last build is failed",
|
||||
buildSuccesses: []bool{true, true, false},
|
||||
hardLimit: 1,
|
||||
expHitHardLimit: true,
|
||||
},
|
||||
{
|
||||
name: "last build is successful",
|
||||
buildSuccesses: []bool{false, false, true},
|
||||
hardLimit: 1,
|
||||
expHitHardLimit: false,
|
||||
},
|
||||
{
|
||||
name: "last 3 builds are failed - hard limit is reached",
|
||||
buildSuccesses: []bool{true, true, false, false, false},
|
||||
hardLimit: 3,
|
||||
expHitHardLimit: true,
|
||||
},
|
||||
{
|
||||
name: "1 out of 3 last build is successful - hard limit is NOT reached",
|
||||
buildSuccesses: []bool{false, false, true, false, false},
|
||||
hardLimit: 3,
|
||||
expHitHardLimit: false,
|
||||
},
|
||||
// hardLimit set to zero, implicitly disables the hard limit.
|
||||
{
|
||||
name: "despite 5 failed builds, the hard limit is not reached because it's disabled.",
|
||||
buildSuccesses: []bool{false, false, false, false, false},
|
||||
hardLimit: 0,
|
||||
expHitHardLimit: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
db, _ := dbtestutil.NewDB(t)
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
dbgen.Organization(t, db, database.Organization{
|
||||
ID: orgID,
|
||||
})
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: userID,
|
||||
})
|
||||
|
||||
tmpl := createTemplate(t, db, orgID, userID)
|
||||
tmplV1 := createTmplVersionAndPreset(t, db, tmpl, tmpl.ActiveVersionID, now, nil)
|
||||
for idx, buildSuccess := range tc.buildSuccesses {
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl, tmplV1, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: !buildSuccess,
|
||||
createdAt: hourBefore.Add(time.Duration(idx) * time.Second),
|
||||
})
|
||||
}
|
||||
|
||||
hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, tc.hardLimit)
|
||||
require.NoError(t, err)
|
||||
|
||||
if !tc.expHitHardLimit {
|
||||
require.Len(t, hardLimitedPresets, 0)
|
||||
return
|
||||
}
|
||||
|
||||
require.Len(t, hardLimitedPresets, 1)
|
||||
hardLimitedPreset := hardLimitedPresets[0]
|
||||
require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.PresetID, tmplV1.preset.ID)
|
||||
})
|
||||
}
|
||||
|
||||
t.Run("Ignore Inactive Version", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
db, _ := dbtestutil.NewDB(t)
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
dbgen.Organization(t, db, database.Organization{
|
||||
ID: orgID,
|
||||
})
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: userID,
|
||||
})
|
||||
|
||||
tmpl := createTemplate(t, db, orgID, userID)
|
||||
tmplV1 := createTmplVersionAndPreset(t, db, tmpl, uuid.New(), now, nil)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl, tmplV1, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
|
||||
// Active Version
|
||||
tmplV2 := createTmplVersionAndPreset(t, db, tmpl, tmpl.ActiveVersionID, now, nil)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl, tmplV2, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl, tmplV2, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
|
||||
hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, 1)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Len(t, hardLimitedPresets, 1)
|
||||
hardLimitedPreset := hardLimitedPresets[0]
|
||||
require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.PresetID, tmplV2.preset.ID)
|
||||
})
|
||||
|
||||
t.Run("Multiple Templates", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
db, _ := dbtestutil.NewDB(t)
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
dbgen.Organization(t, db, database.Organization{
|
||||
ID: orgID,
|
||||
})
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: userID,
|
||||
})
|
||||
|
||||
tmpl1 := createTemplate(t, db, orgID, userID)
|
||||
tmpl1V1 := createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
|
||||
tmpl2 := createTemplate(t, db, orgID, userID)
|
||||
tmpl2V1 := createTmplVersionAndPreset(t, db, tmpl2, tmpl2.ActiveVersionID, now, nil)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl2, tmpl2V1, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
|
||||
hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, 1)
|
||||
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Len(t, hardLimitedPresets, 2)
|
||||
{
|
||||
hardLimitedPreset := findPresetByTmplVersionID(hardLimitedPresets, tmpl1.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl1.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.PresetID, tmpl1V1.preset.ID)
|
||||
}
|
||||
{
|
||||
hardLimitedPreset := findPresetByTmplVersionID(hardLimitedPresets, tmpl2.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl2.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.PresetID, tmpl2V1.preset.ID)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("Multiple Templates, Versions and Workspace Builds", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
db, _ := dbtestutil.NewDB(t)
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
dbgen.Organization(t, db, database.Organization{
|
||||
ID: orgID,
|
||||
})
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: userID,
|
||||
})
|
||||
|
||||
tmpl1 := createTemplate(t, db, orgID, userID)
|
||||
tmpl1V1 := createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
|
||||
tmpl2 := createTemplate(t, db, orgID, userID)
|
||||
tmpl2V1 := createTmplVersionAndPreset(t, db, tmpl2, tmpl2.ActiveVersionID, now, nil)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl2, tmpl2V1, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl2, tmpl2V1, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
|
||||
tmpl3 := createTemplate(t, db, orgID, userID)
|
||||
tmpl3V1 := createTmplVersionAndPreset(t, db, tmpl3, uuid.New(), now, nil)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl3, tmpl3V1, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
|
||||
tmpl3V2 := createTmplVersionAndPreset(t, db, tmpl3, tmpl3.ActiveVersionID, now, nil)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl3, tmpl3V2, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl3, tmpl3V2, orgID, now, &createPrebuiltWorkspaceOpts{
|
||||
failedJob: true,
|
||||
})
|
||||
|
||||
hardLimit := int64(2)
|
||||
hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, hardLimit)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Len(t, hardLimitedPresets, 3)
|
||||
{
|
||||
hardLimitedPreset := findPresetByTmplVersionID(hardLimitedPresets, tmpl1.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl1.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.PresetID, tmpl1V1.preset.ID)
|
||||
}
|
||||
{
|
||||
hardLimitedPreset := findPresetByTmplVersionID(hardLimitedPresets, tmpl2.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl2.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.PresetID, tmpl2V1.preset.ID)
|
||||
}
|
||||
{
|
||||
hardLimitedPreset := findPresetByTmplVersionID(hardLimitedPresets, tmpl3.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl3.ActiveVersionID)
|
||||
require.Equal(t, hardLimitedPreset.PresetID, tmpl3V2.preset.ID)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("No Workspace Builds", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
db, _ := dbtestutil.NewDB(t)
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
dbgen.Organization(t, db, database.Organization{
|
||||
ID: orgID,
|
||||
})
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: userID,
|
||||
})
|
||||
|
||||
tmpl1 := createTemplate(t, db, orgID, userID)
|
||||
createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil)
|
||||
|
||||
hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, 1)
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, hardLimitedPresets)
|
||||
})
|
||||
|
||||
t.Run("No Failed Workspace Builds", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
db, _ := dbtestutil.NewDB(t)
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
dbgen.Organization(t, db, database.Organization{
|
||||
ID: orgID,
|
||||
})
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: userID,
|
||||
})
|
||||
|
||||
tmpl1 := createTemplate(t, db, orgID, userID)
|
||||
tmpl1V1 := createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil)
|
||||
successfulJobOpts := createPrebuiltWorkspaceOpts{}
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &successfulJobOpts)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &successfulJobOpts)
|
||||
createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &successfulJobOpts)
|
||||
|
||||
hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, 1)
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, hardLimitedPresets)
|
||||
})
|
||||
}
|
||||
|
||||
func requireUsersMatch(t testing.TB, expected []database.User, found []database.GetUsersRow, msg string) {
|
||||
t.Helper()
|
||||
require.ElementsMatch(t, expected, database.ConvertUserRows(found), msg)
|
||||
|
Reference in New Issue
Block a user