fix: remove notifications for hard-limited prebuilds (#18528)

Relates to https://github.com/coder/internal/issues/674


Currently, we send notifications to **all template admins** for **every
failed and hard-limited preset**. This can generate excessive
noise—especially when someone is debugging a template and creates
multiple broken versions in quick succession.

For now, we've decided to remove hard-limited preset notifications to
reduce excessive noise.
In the long term, we plan to aggregate failure information and deliver
it on a daily or weekly basis.
This commit is contained in:
Yevhenii Shcherbina
2025-06-24 08:43:16 -04:00
committed by GitHub
parent 7b152cdd91
commit bca5c35aa2
2 changed files with 0 additions and 87 deletions

View File

@ -427,7 +427,6 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
// If the preset reached the hard failure limit for the first time during this iteration:
// - Mark it as hard-limited in the database
// - Send notifications to template admins
// - Continue execution, we disallow only creation operation for hard-limited presets. Deletion is allowed.
if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited {
logger.Warn(ctx, "preset is hard limited, notifying template admins")
@ -439,11 +438,6 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
if err != nil {
return xerrors.Errorf("failed to update preset prebuild status: %w", err)
}
err = c.notifyPrebuildFailureLimitReached(ctx, ps)
if err != nil {
logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err))
}
}
state := ps.CalculateState()
@ -474,49 +468,6 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
return multiErr.ErrorOrNil()
}
func (c *StoreReconciler) notifyPrebuildFailureLimitReached(ctx context.Context, ps prebuilds.PresetSnapshot) error {
// nolint:gocritic // Necessary to query all the required data.
ctx = dbauthz.AsSystemRestricted(ctx)
// Send notification to template admins.
if c.notifEnq == nil {
c.logger.Warn(ctx, "notification enqueuer not set, cannot send prebuild is hard limited notification(s)")
return nil
}
templateAdmins, err := c.store.GetUsers(ctx, database.GetUsersParams{
RbacRole: []string{codersdk.RoleTemplateAdmin},
})
if err != nil {
return xerrors.Errorf("fetch template admins: %w", err)
}
for _, templateAdmin := range templateAdmins {
if _, err := c.notifEnq.EnqueueWithData(ctx, templateAdmin.ID, notifications.PrebuildFailureLimitReached,
map[string]string{
"org": ps.Preset.OrganizationName,
"template": ps.Preset.TemplateName,
"template_version": ps.Preset.TemplateVersionName,
"preset": ps.Preset.Name,
},
map[string]any{},
"prebuilds_reconciler",
// Associate this notification with all the related entities.
ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID, ps.Preset.OrganizationID,
); err != nil {
c.logger.Error(ctx,
"failed to send notification",
slog.Error(err),
slog.F("template_admin_id", templateAdmin.ID.String()),
)
continue
}
}
return nil
}
func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) ([]*prebuilds.ReconciliationActions, error) {
if ctx.Err() != nil {
return nil, ctx.Err()

View File

@ -853,11 +853,6 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
controller := prebuilds.NewStoreReconciler(db, pubSub, cache, cfg, logger, clock, registry, fakeEnqueuer)
// Template admin to receive a notification.
templateAdmin := dbgen.User(t, db, database.User{
RBACRoles: []string{codersdk.RoleTemplateAdmin},
})
// Set up test environment with a template, version, and preset.
ownerID := uuid.New()
dbgen.User(t, db, database.User{
@ -938,20 +933,6 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
require.Equal(t, 1, len(workspaces))
require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus)
// When hard limit is reached, a notification should be sent.
matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool {
if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") {
return false
}
if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") {
return false
}
return true
})
require.Len(t, matching, 1)
// When hard limit is reached, metric is set to 1.
mf, err = registry.Gather()
require.NoError(t, err)
@ -1016,11 +997,6 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
controller := prebuilds.NewStoreReconciler(db, pubSub, cache, cfg, logger, clock, registry, fakeEnqueuer)
// Template admin to receive a notification.
templateAdmin := dbgen.User(t, db, database.User{
RBACRoles: []string{codersdk.RoleTemplateAdmin},
})
// Set up test environment with a template, version, and preset.
ownerID := uuid.New()
dbgen.User(t, db, database.User{
@ -1125,20 +1101,6 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
require.NoError(t, err)
require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus)
// When hard limit is reached, a notification should be sent.
matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool {
if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") {
return false
}
if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") {
return false
}
return true
})
require.Len(t, matching, 1)
// When hard limit is reached, metric is set to 1.
mf, err = registry.Gather()
require.NoError(t, err)