Files
coder/coderd/prebuilds/global_snapshot.go
Yevhenii Shcherbina 53e8e9c7cd fix: reduce cost of prebuild failure (#17697)
Relates to https://github.com/coder/coder/issues/17432

### Part 1:

Notes:
- `GetPresetsAtFailureLimit` SQL query is added. It is similar to
`GetPresetsBackoff`: both use the same CTEs (`filtered_builds`,
`time_sorted_builds`), but the queries are still different.

- The query is executed on every loop iteration. As an optimization, we
could mark a specific preset as permanently failed to avoid executing
the query on every loop iteration, but I decided not to do that for
now.

- By default `FailureHardLimit` is set to 3.

- `FailureHardLimit` is configurable. Setting it to zero means that the
hard limit is disabled.

### Part 2

Notes:
- `PrebuildFailureLimitReached` notification is added.
- Notification is sent to template admins.
- Notification is sent only the first time the hard limit is reached,
but a `log.Warn` is still emitted on every loop iteration.
- I introduced this enum:
```sql
CREATE TYPE prebuild_status AS ENUM (
  'normal',           -- Prebuilds are working as expected; this is the default, healthy state.
  'hard_limited',     -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore.
  'validation_failed' -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried.
);
```
`validation_failed` is not used in this PR, but I think it will be used in
the next one, so I wanted to save us an extra migration.

- Notification looks like this:
<img width="472" alt="image"
src="https://github.com/user-attachments/assets/e10efea0-1790-4e7f-a65c-f94c40fced27"
/>

### Latest notification views:
<img width="463" alt="image"
src="https://github.com/user-attachments/assets/11310c58-68d1-4075-a497-f76d854633fe"
/>
<img width="725" alt="image"
src="https://github.com/user-attachments/assets/6bbfe21a-91ac-47c3-a9d1-21807bb0c53a"
/>
2025-05-21 15:16:38 -04:00

75 lines
2.3 KiB
Go

package prebuilds
import (
"github.com/google/uuid"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/util/slice"
)
// GlobalSnapshot represents a full point-in-time snapshot of state relating to prebuilds across all templates.
// Use FilterByPreset to narrow it down to the state of a single preset.
type GlobalSnapshot struct {
// Presets is searched by ID in FilterByPreset to locate the requested preset.
Presets []database.GetTemplatePresetsWithPrebuildsRow
// RunningPrebuilds is filtered in FilterByPreset by each row's CurrentPresetID.
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
// PrebuildsInProgress is filtered in FilterByPreset by each row's PresetID.
PrebuildsInProgress []database.CountInProgressPrebuildsRow
// Backoffs holds at most one relevant row per preset as far as FilterByPreset
// is concerned: it uses the first row whose PresetID matches.
Backoffs []database.GetPresetsBackoffRow
// HardLimitedPresets lists presets considered hard-limited; a preset is
// reported as hard-limited by FilterByPreset when its ID appears here.
HardLimitedPresets []database.GetPresetsAtFailureLimitRow
}
// NewGlobalSnapshot assembles a GlobalSnapshot from the given query results.
// Each argument is stored as-is in the field of the same name; the slices are
// not copied.
func NewGlobalSnapshot(
	presets []database.GetTemplatePresetsWithPrebuildsRow,
	runningPrebuilds []database.GetRunningPrebuiltWorkspacesRow,
	prebuildsInProgress []database.CountInProgressPrebuildsRow,
	backoffs []database.GetPresetsBackoffRow,
	hardLimitedPresets []database.GetPresetsAtFailureLimitRow,
) GlobalSnapshot {
	var snapshot GlobalSnapshot

	snapshot.Presets = presets
	snapshot.RunningPrebuilds = runningPrebuilds
	snapshot.PrebuildsInProgress = prebuildsInProgress
	snapshot.Backoffs = backoffs
	snapshot.HardLimitedPresets = hardLimitedPresets

	return snapshot
}
// FilterByPreset narrows the global snapshot down to the state of a single
// preset.
//
// It returns an error if no entry in s.Presets has the given ID. Otherwise the
// returned PresetSnapshot contains:
//   - the matching preset,
//   - the running prebuilds whose CurrentPresetID is set and equals the preset ID,
//   - the in-progress rows whose PresetID is set and equals the preset ID,
//   - the preset's backoff row, or nil when s.Backoffs has no row for it,
//   - whether the preset appears in s.HardLimitedPresets.
func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, error) {
	preset, found := slice.Find(s.Presets, func(preset database.GetTemplatePresetsWithPrebuildsRow) bool {
		return preset.ID == presetID
	})
	if !found {
		return nil, xerrors.Errorf("no preset found with ID %q", presetID)
	}

	running := slice.Filter(s.RunningPrebuilds, func(prebuild database.GetRunningPrebuiltWorkspacesRow) bool {
		// A row without a preset ID never belongs to any preset.
		return prebuild.CurrentPresetID.Valid && prebuild.CurrentPresetID.UUID == preset.ID
	})

	inProgress := slice.Filter(s.PrebuildsInProgress, func(prebuild database.CountInProgressPrebuildsRow) bool {
		// Check Valid here too, consistent with the running filter: an unset
		// preset ID carries the zero UUID and must not be matched by value.
		return prebuild.PresetID.Valid && prebuild.PresetID.UUID == preset.ID
	})

	// Backoff stays nil when there is no backoff row for this preset.
	var backoffPtr *database.GetPresetsBackoffRow
	backoff, found := slice.Find(s.Backoffs, func(row database.GetPresetsBackoffRow) bool {
		return row.PresetID == preset.ID
	})
	if found {
		backoffPtr = &backoff
	}

	// Only membership matters for the hard limit; the row itself is not needed.
	_, isHardLimited := slice.Find(s.HardLimitedPresets, func(row database.GetPresetsAtFailureLimitRow) bool {
		return row.PresetID == preset.ID
	})

	return &PresetSnapshot{
		Preset:        preset,
		Running:       running,
		InProgress:    inProgress,
		Backoff:       backoffPtr,
		IsHardLimited: isHardLimited,
	}, nil
}