mirror of
https://github.com/coder/coder.git
synced 2025-07-12 00:14:10 +00:00
fix: reduce cost of prebuild failure (#17697)
Relates to https://github.com/coder/coder/issues/17432 ### Part 1: Notes: - `GetPresetsAtFailureLimit` SQL query is added, which is similar to `GetPresetsBackoff`; they use the same CTEs (`filtered_builds`, `time_sorted_builds`), but they are still different. - The query is executed on every loop iteration. We can consider marking a specific preset as permanently failed as an optimization to avoid executing the query on every loop iteration, but I decided not to do it for now. - By default `FailureHardLimit` is set to 3. - `FailureHardLimit` is configurable. Setting it to zero means that the hard limit is disabled. ### Part 2 Notes: - `PrebuildFailureLimitReached` notification is added. - Notification is sent to template admins. - Notification is sent only the first time the hard limit is reached, but it will `log.Warn` on every loop iteration. - I introduced this enum: ```sql CREATE TYPE prebuild_status AS ENUM ( 'normal', -- Prebuilds are working as expected; this is the default, healthy state. 'hard_limited', -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore. 'validation_failed' -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried. ); ``` `validation_failed` is not used in this PR, but I think it will be used in the next one, so I wanted to save us an extra migration. - Notification looks like this: <img width="472" alt="image" src="https://github.com/user-attachments/assets/e10efea0-1790-4e7f-a65c-f94c40fced27" /> ### Latest notification views: <img width="463" alt="image" src="https://github.com/user-attachments/assets/11310c58-68d1-4075-a497-f76d854633fe" /> <img width="725" alt="image" src="https://github.com/user-attachments/assets/6bbfe21a-91ac-47c3-a9d1-21807bb0c53a" />
This commit is contained in:
committed by
GitHub
parent
e1934fe119
commit
53e8e9c7cd
@ -2226,6 +2226,15 @@ func (q *querier) GetPresetParametersByTemplateVersionID(ctx context.Context, ar
|
||||
return q.db.GetPresetParametersByTemplateVersionID(ctx, args)
|
||||
}
|
||||
|
||||
func (q *querier) GetPresetsAtFailureLimit(ctx context.Context, hardLimit int64) ([]database.GetPresetsAtFailureLimitRow, error) {
|
||||
// GetPresetsAtFailureLimit returns a list of template version presets that have reached the hard failure limit.
|
||||
// Request the same authorization permissions as GetPresetsBackoff, since the methods are similar.
|
||||
if err := q.authorizeContext(ctx, policy.ActionViewInsights, rbac.ResourceTemplate.All()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return q.db.GetPresetsAtFailureLimit(ctx, hardLimit)
|
||||
}
|
||||
|
||||
func (q *querier) GetPresetsBackoff(ctx context.Context, lookback time.Time) ([]database.GetPresetsBackoffRow, error) {
|
||||
// GetPresetsBackoff returns a list of template version presets along with metadata such as the number of failed prebuilds.
|
||||
if err := q.authorizeContext(ctx, policy.ActionViewInsights, rbac.ResourceTemplate.All()); err != nil {
|
||||
@ -4201,6 +4210,24 @@ func (q *querier) UpdateOrganizationDeletedByID(ctx context.Context, arg databas
|
||||
return deleteQ(q.log, q.auth, q.db.GetOrganizationByID, deleteF)(ctx, arg.ID)
|
||||
}
|
||||
|
||||
func (q *querier) UpdatePresetPrebuildStatus(ctx context.Context, arg database.UpdatePresetPrebuildStatusParams) error {
|
||||
preset, err := q.db.GetPresetByID(ctx, arg.PresetID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
object := rbac.ResourceTemplate.
|
||||
WithID(preset.TemplateID.UUID).
|
||||
InOrg(preset.OrganizationID)
|
||||
|
||||
err = q.authorizeContext(ctx, policy.ActionUpdate, object)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return q.db.UpdatePresetPrebuildStatus(ctx, arg)
|
||||
}
|
||||
|
||||
func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg database.UpdateProvisionerDaemonLastSeenAtParams) error {
|
||||
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerDaemon); err != nil {
|
||||
return err
|
||||
|
@ -4924,6 +4924,11 @@ func (s *MethodTestSuite) TestPrebuilds() {
|
||||
Asserts(rbac.ResourceWorkspace.All(), policy.ActionRead).
|
||||
ErrorsWithInMemDB(dbmem.ErrUnimplemented)
|
||||
}))
|
||||
s.Run("GetPresetsAtFailureLimit", s.Subtest(func(_ database.Store, check *expects) {
|
||||
check.Args(int64(0)).
|
||||
Asserts(rbac.ResourceTemplate.All(), policy.ActionViewInsights).
|
||||
ErrorsWithInMemDB(dbmem.ErrUnimplemented)
|
||||
}))
|
||||
s.Run("GetPresetsBackoff", s.Subtest(func(_ database.Store, check *expects) {
|
||||
check.Args(time.Time{}).
|
||||
Asserts(rbac.ResourceTemplate.All(), policy.ActionViewInsights).
|
||||
@ -4971,8 +4976,34 @@ func (s *MethodTestSuite) TestPrebuilds() {
|
||||
},
|
||||
InvalidateAfterSecs: preset.InvalidateAfterSecs,
|
||||
OrganizationID: org.ID,
|
||||
PrebuildStatus: database.PrebuildStatusHealthy,
|
||||
})
|
||||
}))
|
||||
s.Run("UpdatePresetPrebuildStatus", s.Subtest(func(db database.Store, check *expects) {
|
||||
org := dbgen.Organization(s.T(), db, database.Organization{})
|
||||
user := dbgen.User(s.T(), db, database.User{})
|
||||
template := dbgen.Template(s.T(), db, database.Template{
|
||||
OrganizationID: org.ID,
|
||||
CreatedBy: user.ID,
|
||||
})
|
||||
templateVersion := dbgen.TemplateVersion(s.T(), db, database.TemplateVersion{
|
||||
TemplateID: uuid.NullUUID{
|
||||
UUID: template.ID,
|
||||
Valid: true,
|
||||
},
|
||||
OrganizationID: org.ID,
|
||||
CreatedBy: user.ID,
|
||||
})
|
||||
preset := dbgen.Preset(s.T(), db, database.InsertPresetParams{
|
||||
TemplateVersionID: templateVersion.ID,
|
||||
})
|
||||
req := database.UpdatePresetPrebuildStatusParams{
|
||||
PresetID: preset.ID,
|
||||
Status: database.PrebuildStatusHealthy,
|
||||
}
|
||||
check.Args(req).
|
||||
Asserts(rbac.ResourceTemplate.WithID(template.ID).InOrg(org.ID), policy.ActionUpdate)
|
||||
}))
|
||||
}
|
||||
|
||||
func (s *MethodTestSuite) TestOAuth2ProviderApps() {
|
||||
|
Reference in New Issue
Block a user