feat: track resource replacements when claiming a prebuilt workspace (#17571)

Closes https://github.com/coder/internal/issues/369

We can't know whether a replacement (i.e. drift of terraform state
leading to a resource needing to be deleted/recreated) will take place
apriori; we can only detect it at `plan` time, because the provider
decides whether a resource must be replaced and it cannot be inferred
through static analysis of the template.

**This is likely to be the most common gotcha with using prebuilds,
since it requires a slight template modification to use prebuilds
effectively**, so let's head this off before it's an issue for
customers.

Drift details will now be logged in the workspace build logs:


![image](https://github.com/user-attachments/assets/da1988b6-2cbe-4a79-a3c5-ea29891f3d6f)

Plus a notification will be sent to template admins when this situation
arises:


![image](https://github.com/user-attachments/assets/39d555b1-a262-4a3e-b529-03b9f23bf66a)

A new metric - `coderd_prebuilt_workspaces_resource_replacements_total`
- will also increment each time a workspace encounters replacements.

We only track _that_ a resource replacement occurred, not how many. Just
one is enough to ruin a prebuild, but we can't know apriori which
replacement would cause this.
For example, say we have 2 replacements: a `docker_container` and a
`null_resource`; we don't know which one might
cause an issue (or indeed if either would), so we just track the
replacement.

---------

Signed-off-by: Danny Kopping <dannykopping@gmail.com>
This commit is contained in:
Danny Kopping
2025-05-14 14:52:22 +02:00
committed by GitHub
parent e75d1c1ce5
commit 6e967780c9
33 changed files with 2048 additions and 969 deletions

View File

@ -928,6 +928,37 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
options.StatsBatcher = batcher
defer closeBatcher()
// Manage notifications.
var (
notificationsCfg = options.DeploymentValues.Notifications
notificationsManager *notifications.Manager
)
metrics := notifications.NewMetrics(options.PrometheusRegistry)
helpers := templateHelpers(options)
// The enqueuer is responsible for enqueueing notifications to the given store.
enqueuer, err := notifications.NewStoreEnqueuer(notificationsCfg, options.Database, helpers, logger.Named("notifications.enqueuer"), quartz.NewReal())
if err != nil {
return xerrors.Errorf("failed to instantiate notification store enqueuer: %w", err)
}
options.NotificationsEnqueuer = enqueuer
// The notification manager is responsible for:
// - creating notifiers and managing their lifecycles (notifiers are responsible for dequeueing/sending notifications)
// - keeping the store updated with status updates
notificationsManager, err = notifications.NewManager(notificationsCfg, options.Database, options.Pubsub, helpers, metrics, logger.Named("notifications.manager"))
if err != nil {
return xerrors.Errorf("failed to instantiate notification manager: %w", err)
}
// nolint:gocritic // We need to run the manager in a notifier context.
notificationsManager.Run(dbauthz.AsNotifier(ctx))
// Run report generator to distribute periodic reports.
notificationReportGenerator := reports.NewReportGenerator(ctx, logger.Named("notifications.report_generator"), options.Database, options.NotificationsEnqueuer, quartz.NewReal())
defer notificationReportGenerator.Close()
// We use a separate coderAPICloser so the Enterprise API
// can have its own close functions. This is cleaner
// than abstracting the Coder API itself.
@ -975,37 +1006,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
return xerrors.Errorf("write config url: %w", err)
}
// Manage notifications.
var (
notificationsCfg = options.DeploymentValues.Notifications
notificationsManager *notifications.Manager
)
metrics := notifications.NewMetrics(options.PrometheusRegistry)
helpers := templateHelpers(options)
// The enqueuer is responsible for enqueueing notifications to the given store.
enqueuer, err := notifications.NewStoreEnqueuer(notificationsCfg, options.Database, helpers, logger.Named("notifications.enqueuer"), quartz.NewReal())
if err != nil {
return xerrors.Errorf("failed to instantiate notification store enqueuer: %w", err)
}
options.NotificationsEnqueuer = enqueuer
// The notification manager is responsible for:
// - creating notifiers and managing their lifecycles (notifiers are responsible for dequeueing/sending notifications)
// - keeping the store updated with status updates
notificationsManager, err = notifications.NewManager(notificationsCfg, options.Database, options.Pubsub, helpers, metrics, logger.Named("notifications.manager"))
if err != nil {
return xerrors.Errorf("failed to instantiate notification manager: %w", err)
}
// nolint:gocritic // We need to run the manager in a notifier context.
notificationsManager.Run(dbauthz.AsNotifier(ctx))
// Run report generator to distribute periodic reports.
notificationReportGenerator := reports.NewReportGenerator(ctx, logger.Named("notifications.report_generator"), options.Database, options.NotificationsEnqueuer, quartz.NewReal())
defer notificationReportGenerator.Close()
// Since errCh only has one buffered slot, all routines
// sending on it must be wrapped in a select/default to
// avoid leaving dangling goroutines waiting for the