mirror of
https://github.com/coder/coder.git
synced 2025-07-12 00:14:10 +00:00
fix: reduce cost of prebuild failure (#17697)
Relates to https://github.com/coder/coder/issues/17432 ### Part 1: Notes: - A `GetPresetsAtFailureLimit` SQL query is added. It is similar to `GetPresetsBackoff` and uses the same CTEs (`filtered_builds`, `time_sorted_builds`), but the two queries are still different. - The query is executed on every loop iteration. As an optimization, we could mark a specific preset as permanently failed to avoid executing the query on every loop iteration, but I decided not to do that for now. - By default, `FailureHardLimit` is set to 3. - `FailureHardLimit` is configurable. Setting it to zero means that the hard limit is disabled. ### Part 2 Notes: - A `PrebuildFailureLimitReached` notification is added. - The notification is sent to template admins. - The notification is sent only the first time the hard limit is reached, but a `log.Warn` is still emitted on every loop iteration. - I introduced this enum: ```sql CREATE TYPE prebuild_status AS ENUM ( 'normal', -- Prebuilds are working as expected; this is the default, healthy state. 'hard_limited', -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore. 'validation_failed' -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried. ); ``` `validation_failed` is not used in this PR, but I think it will be used in the next one, so I wanted to save us an extra migration. - The notification looks like this: <img width="472" alt="image" src="https://github.com/user-attachments/assets/e10efea0-1790-4e7f-a65c-f94c40fced27" /> ### Latest notification views: <img width="463" alt="image" src="https://github.com/user-attachments/assets/11310c58-68d1-4075-a497-f76d854633fe" /> <img width="725" alt="image" src="https://github.com/user-attachments/assets/6bbfe21a-91ac-47c3-a9d1-21807bb0c53a" />
This commit is contained in:
committed by
GitHub
parent
e1934fe119
commit
53e8e9c7cd
@ -14,6 +14,7 @@ type GlobalSnapshot struct {
|
||||
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
|
||||
PrebuildsInProgress []database.CountInProgressPrebuildsRow
|
||||
Backoffs []database.GetPresetsBackoffRow
|
||||
HardLimitedPresets []database.GetPresetsAtFailureLimitRow
|
||||
}
|
||||
|
||||
func NewGlobalSnapshot(
|
||||
@ -21,12 +22,14 @@ func NewGlobalSnapshot(
|
||||
runningPrebuilds []database.GetRunningPrebuiltWorkspacesRow,
|
||||
prebuildsInProgress []database.CountInProgressPrebuildsRow,
|
||||
backoffs []database.GetPresetsBackoffRow,
|
||||
hardLimitedPresets []database.GetPresetsAtFailureLimitRow,
|
||||
) GlobalSnapshot {
|
||||
return GlobalSnapshot{
|
||||
Presets: presets,
|
||||
RunningPrebuilds: runningPrebuilds,
|
||||
PrebuildsInProgress: prebuildsInProgress,
|
||||
Backoffs: backoffs,
|
||||
HardLimitedPresets: hardLimitedPresets,
|
||||
}
|
||||
}
|
||||
|
||||
@ -57,10 +60,15 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err
|
||||
backoffPtr = &backoff
|
||||
}
|
||||
|
||||
_, isHardLimited := slice.Find(s.HardLimitedPresets, func(row database.GetPresetsAtFailureLimitRow) bool {
|
||||
return row.PresetID == preset.ID
|
||||
})
|
||||
|
||||
return &PresetSnapshot{
|
||||
Preset: preset,
|
||||
Running: running,
|
||||
InProgress: inProgress,
|
||||
Backoff: backoffPtr,
|
||||
Preset: preset,
|
||||
Running: running,
|
||||
InProgress: inProgress,
|
||||
Backoff: backoffPtr,
|
||||
IsHardLimited: isHardLimited,
|
||||
}, nil
|
||||
}
|
||||
|
@ -32,10 +32,11 @@ const (
|
||||
// It contains the raw data needed to calculate the current state of a preset's prebuilds,
|
||||
// including running prebuilds, in-progress builds, and backoff information.
|
||||
type PresetSnapshot struct {
|
||||
Preset database.GetTemplatePresetsWithPrebuildsRow
|
||||
Running []database.GetRunningPrebuiltWorkspacesRow
|
||||
InProgress []database.CountInProgressPrebuildsRow
|
||||
Backoff *database.GetPresetsBackoffRow
|
||||
Preset database.GetTemplatePresetsWithPrebuildsRow
|
||||
Running []database.GetRunningPrebuiltWorkspacesRow
|
||||
InProgress []database.CountInProgressPrebuildsRow
|
||||
Backoff *database.GetPresetsBackoffRow
|
||||
IsHardLimited bool
|
||||
}
|
||||
|
||||
// ReconciliationState represents the processed state of a preset's prebuilds,
|
||||
|
@ -73,7 +73,7 @@ func TestNoPrebuilds(t *testing.T) {
|
||||
preset(true, 0, current),
|
||||
}
|
||||
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil)
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@ -98,7 +98,7 @@ func TestNetNew(t *testing.T) {
|
||||
preset(true, 1, current),
|
||||
}
|
||||
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil)
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@ -138,7 +138,7 @@ func TestOutdatedPrebuilds(t *testing.T) {
|
||||
var inProgress []database.CountInProgressPrebuildsRow
|
||||
|
||||
// WHEN: calculating the outdated preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil)
|
||||
ps, err := snapshot.FilterByPreset(outdated.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@ -200,7 +200,7 @@ func TestDeleteOutdatedPrebuilds(t *testing.T) {
|
||||
}
|
||||
|
||||
// WHEN: calculating the outdated preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil)
|
||||
ps, err := snapshot.FilterByPreset(outdated.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@ -442,7 +442,7 @@ func TestInProgressActions(t *testing.T) {
|
||||
}
|
||||
|
||||
// WHEN: calculating the current preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@ -485,7 +485,7 @@ func TestExtraneous(t *testing.T) {
|
||||
var inProgress []database.CountInProgressPrebuildsRow
|
||||
|
||||
// WHEN: calculating the current preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@ -525,7 +525,7 @@ func TestDeprecated(t *testing.T) {
|
||||
var inProgress []database.CountInProgressPrebuildsRow
|
||||
|
||||
// WHEN: calculating the current preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@ -576,7 +576,7 @@ func TestLatestBuildFailed(t *testing.T) {
|
||||
}
|
||||
|
||||
// WHEN: calculating the current preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, backoffs)
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, backoffs, nil)
|
||||
psCurrent, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@ -669,7 +669,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, inProgress, nil)
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, inProgress, nil, nil)
|
||||
|
||||
// Nothing has to be created for preset 1.
|
||||
{
|
||||
|
Reference in New Issue
Block a user