fix: reduce cost of prebuild failure (#17697)

Relates to https://github.com/coder/coder/issues/17432

### Part 1:

Notes:
- The `GetPresetsAtFailureLimit` SQL query is added. It is similar to
`GetPresetsBackoff`: both use the same CTEs (`filtered_builds` and
`time_sorted_builds`), but the queries are still different.

- The query is executed on every loop iteration. As an optimization, we
could mark a specific preset as permanently failed to avoid executing the
query on every loop iteration, but I decided not to do that for
now.

- By default `FailureHardLimit` is set to 3.

- `FailureHardLimit` is configurable. Setting it to zero disables the
hard limit.

### Part 2

Notes:
- `PrebuildFailureLimitReached` notification is added.
- Notification is sent to template admins.
- The notification is sent only the first time the hard limit is reached,
but a warning is logged (`log.Warn`) on every loop iteration.
- I introduced this enum:
```sql
CREATE TYPE prebuild_status AS ENUM (
  'healthy',          -- Prebuilds are working as expected; this is the default, healthy state.
  'hard_limited',     -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore.
  'validation_failed' -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried.
);
```
`validation_failed` is not used in this PR, but I think it will be used in
the next one, so I wanted to save us an extra migration.

- Notification looks like this:
<img width="472" alt="image"
src="https://github.com/user-attachments/assets/e10efea0-1790-4e7f-a65c-f94c40fced27"
/>

### Latest notification views:
<img width="463" alt="image"
src="https://github.com/user-attachments/assets/11310c58-68d1-4075-a497-f76d854633fe"
/>
<img width="725" alt="image"
src="https://github.com/user-attachments/assets/6bbfe21a-91ac-47c3-a9d1-21807bb0c53a"
/>
This commit is contained in:
Yevhenii Shcherbina
2025-05-21 15:16:38 -04:00
committed by GitHub
parent e1934fe119
commit 53e8e9c7cd
32 changed files with 1160 additions and 60 deletions

View File

@ -1343,6 +1343,67 @@ func AllPortShareProtocolValues() []PortShareProtocol {
}
}
// PrebuildStatus is the Go representation of the prebuild_status enum. Its
// underlying string is the enum label as stored in the database.
type PrebuildStatus string

const (
	// PrebuildStatusHealthy means prebuilds are working as expected.
	PrebuildStatusHealthy PrebuildStatus = "healthy"
	// PrebuildStatusHardLimited means prebuilds failed repeatedly and hit the
	// configured hard failure limit.
	PrebuildStatusHardLimited PrebuildStatus = "hard_limited"
	// PrebuildStatusValidationFailed means prebuilds failed due to a
	// non-retryable validation error.
	PrebuildStatusValidationFailed PrebuildStatus = "validation_failed"
)

// Scan implements the Scanner interface. It accepts a []byte or a string and
// stores it verbatim (no validation against the known labels); any other
// source type yields an error and leaves the receiver untouched.
func (e *PrebuildStatus) Scan(src interface{}) error {
	if raw, ok := src.([]byte); ok {
		*e = PrebuildStatus(raw)
		return nil
	}
	if text, ok := src.(string); ok {
		*e = PrebuildStatus(text)
		return nil
	}
	return fmt.Errorf("unsupported scan type for PrebuildStatus: %T", src)
}

// NullPrebuildStatus is a PrebuildStatus that may be NULL.
type NullPrebuildStatus struct {
	PrebuildStatus PrebuildStatus `json:"prebuild_status"`
	Valid          bool           `json:"valid"` // Valid is true if PrebuildStatus is not NULL
}

// Scan implements the Scanner interface.
//
// A nil value resets the receiver to the NULL state. For any non-nil value,
// Valid is set to true before the value is handed to PrebuildStatus.Scan, so
// Valid stays true even when that inner scan returns an error.
func (ns *NullPrebuildStatus) Scan(value interface{}) error {
	if value != nil {
		ns.Valid = true
		return ns.PrebuildStatus.Scan(value)
	}
	ns.PrebuildStatus = ""
	ns.Valid = false
	return nil
}

// Value implements the driver Valuer interface. It returns nil for the NULL
// state and the status as a plain string otherwise.
func (ns NullPrebuildStatus) Value() (driver.Value, error) {
	if ns.Valid {
		return string(ns.PrebuildStatus), nil
	}
	return nil, nil
}

// Valid reports whether e is one of the declared PrebuildStatus constants.
func (e PrebuildStatus) Valid() bool {
	return e == PrebuildStatusHealthy ||
		e == PrebuildStatusHardLimited ||
		e == PrebuildStatusValidationFailed
}

// AllPrebuildStatusValues returns every declared PrebuildStatus constant, in
// declaration order. Each call returns a freshly allocated slice.
func AllPrebuildStatusValues() []PrebuildStatus {
	all := make([]PrebuildStatus, 0, 3)
	all = append(all,
		PrebuildStatusHealthy,
		PrebuildStatusHardLimited,
		PrebuildStatusValidationFailed,
	)
	return all
}
// ProvisionerDaemonStatus is the status of a provisioner daemon.
type ProvisionerDaemonStatus string
@ -3248,12 +3309,13 @@ type TemplateVersionParameter struct {
}
// TemplateVersionPreset holds one preset of a template version; each field's
// `db` tag names the column it maps to.
//
// NOTE(review): this listing appears to be a rendered diff hunk (12 lines
// before, 13 after) with the removed and re-aligned added lines merged, which
// is why the first six fields are listed twice. PrebuildStatus looks like the
// only field actually added — confirm against the real file.
type TemplateVersionPreset struct {
ID uuid.UUID `db:"id" json:"id"`
TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"`
Name string `db:"name" json:"name"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
DesiredInstances sql.NullInt32 `db:"desired_instances" json:"desired_instances"`
InvalidateAfterSecs sql.NullInt32 `db:"invalidate_after_secs" json:"invalidate_after_secs"`
ID uuid.UUID `db:"id" json:"id"`
TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"`
Name string `db:"name" json:"name"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
DesiredInstances sql.NullInt32 `db:"desired_instances" json:"desired_instances"`
InvalidateAfterSecs sql.NullInt32 `db:"invalidate_after_secs" json:"invalidate_after_secs"`
// PrebuildStatus is the preset's prebuild_status column value.
PrebuildStatus PrebuildStatus `db:"prebuild_status" json:"prebuild_status"`
}
type TemplateVersionPresetParameter struct {